# Log

# Data Preprocessing

In [152]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder



In [153]:
# Step 1: Load the "weblog.csv" dataset
dataset_path = "weblog.csv"
df = pd.read_csv(dataset_path)

# Step 2: Inspect the dataset
print("Dataset Info:")
print(df.info())



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16007 entries, 0 to 16006
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   IP      16007 non-null  object
 1   Time    16007 non-null  object
 2   URL     16007 non-null  object
 3   Staus   16007 non-null  object
dtypes: object(4)
memory usage: 500.3+ KB
None


In [154]:
# Rename the "Staus" column to "Status"
df.rename(columns={"Staus": "Status"}, inplace=True)

# Display the updated DataFrame
print(df.head())

           IP                   Time  \
0  10.128.2.1  [29/Nov/2017:06:58:55   
1  10.128.2.1  [29/Nov/2017:06:59:02   
2  10.128.2.1  [29/Nov/2017:06:59:03   
3  10.131.2.1  [29/Nov/2017:06:59:04   
4  10.130.2.1  [29/Nov/2017:06:59:06   

                                             URL Status  
0                        GET /login.php HTTP/1.1    200  
1                     POST /process.php HTTP/1.1    302  
2                         GET /home.php HTTP/1.1    200  
3          GET /js/vendor/moment.min.js HTTP/1.1    200  
4  GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1    200  


In [155]:
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Data Cleaning and Handling Missing Values
df.dropna(inplace=True)  # Remove rows with missing values



First few rows of the dataset:
           IP                   Time  \
0  10.128.2.1  [29/Nov/2017:06:58:55   
1  10.128.2.1  [29/Nov/2017:06:59:02   
2  10.128.2.1  [29/Nov/2017:06:59:03   
3  10.131.2.1  [29/Nov/2017:06:59:04   
4  10.130.2.1  [29/Nov/2017:06:59:06   

                                             URL Status  
0                        GET /login.php HTTP/1.1    200  
1                     POST /process.php HTTP/1.1    302  
2                         GET /home.php HTTP/1.1    200  
3          GET /js/vendor/moment.min.js HTTP/1.1    200  
4  GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1    200  


In [156]:
df['Time'] = df['Time'].str.replace('[', '', regex=False)
df['Time']


0        29/Nov/2017:06:58:55
1        29/Nov/2017:06:59:02
2        29/Nov/2017:06:59:03
3        29/Nov/2017:06:59:04
4        29/Nov/2017:06:59:06
                 ...         
16002    02/Mar/2018:15:47:12
16003    02/Mar/2018:15:47:23
16004    02/Mar/2018:15:47:32
16005    02/Mar/2018:15:47:35
16006    02/Mar/2018:15:47:46
Name: Time, Length: 16007, dtype: object

In [157]:
time_components = df['Time'].str.split(':', expand=True)
df['Date'] = time_components[0]
df['Date']


0        29/Nov/2017
1        29/Nov/2017
2        29/Nov/2017
3        29/Nov/2017
4        29/Nov/2017
            ...     
16002    02/Mar/2018
16003    02/Mar/2018
16004    02/Mar/2018
16005    02/Mar/2018
16006    02/Mar/2018
Name: Date, Length: 16007, dtype: object

In [158]:
df['TimeOfDay'] = time_components[1]
df['TimeOfDay']

0        06
1        06
2        06
3        06
4        06
         ..
16002    15
16003    15
16004    15
16005    15
16006    15
Name: TimeOfDay, Length: 16007, dtype: object

In [159]:
# Drop the original 'Time' column
df.drop(columns=['Time'], inplace=True)


In [160]:
# Step 5: Encode categorical features
label_encoder = LabelEncoder()

# Encode URLs
df['URL_Encoded'] = label_encoder.fit_transform(df['URL'])




In [161]:
# Encode HTTP methods
df['Method_Encoded'] = label_encoder.fit_transform(df['URL'])



In [162]:
# Display preprocessed data
print("\nPreprocessed dataset:")
print(df.head())


Preprocessed dataset:
           IP                                            URL Status  \
0  10.128.2.1                        GET /login.php HTTP/1.1    200   
1  10.128.2.1                     POST /process.php HTTP/1.1    302   
2  10.128.2.1                         GET /home.php HTTP/1.1    200   
3  10.131.2.1          GET /js/vendor/moment.min.js HTTP/1.1    200   
4  10.130.2.1  GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1    200   

          Date TimeOfDay  URL_Encoded  Method_Encoded  
0  29/Nov/2017        06          193             193  
1  29/Nov/2017        06          308             308  
2  29/Nov/2017        06          182             182  
3  29/Nov/2017        06          191             191  
4  29/Nov/2017        06           40              40  


# Rule Definition

Defining compliance rules and standards is a critical step in your project. You need to translate these rules into a structured format that can be understood by your system for automated analysis. Let's break down this step:

Define Compliance Rules: Before writing code, make sure you have a clear understanding of the compliance rules and standards that you want to enforce. These could be related to data security, access controls, user privileges, system configurations, and more.

Structured Format: You can represent each compliance rule in your dataset using a dictionary structure or a DataFrame. Each rule could have attributes like the rule description, the conditions to check, associated keywords, categories, and priority levels.

Categorization and Priority: If your organization has different categories of compliance rules (e.g., security, privacy, operational), you can organize them accordingly. Similarly, you can assign priority levels to each rule to determine the severity of non-compliance.

In [163]:
import pandas as pd

# Define compliance rules in a DataFrame
compliance_rules = pd.DataFrame({
    'Rule Description': [
        'Ensure strong passwords are used',
        'Monitor failed login attempts',
        'Restrict unauthorized access'
    ],
    'Conditions': [
        'Password length >= 8 characters',
        'Failed login attempts >= 5 within 1 hour',
        'Access to sensitive data requires multi-factor authentication'
    ],
    'Keywords': [
        'password', 'login', 'access'
    ],
    'Category': [
        'Security', 'Security', 'Access Control'
    ],
    'Priority': [
        'High', 'Medium', 'High'
    ]
})

# Display the compliance rules DataFrame
print(compliance_rules)


                   Rule Description  \
0  Ensure strong passwords are used   
1     Monitor failed login attempts   
2      Restrict unauthorized access   

                                          Conditions  Keywords  \
0                    Password length >= 8 characters  password   
1           Failed login attempts >= 5 within 1 hour     login   
2  Access to sensitive data requires multi-factor...    access   

         Category Priority  
0        Security     High  
1        Security   Medium  
2  Access Control     High  


# Model Selection and Development

The choice of the large language model (LLM) depends on the specific requirements of your project and the nature of the text data you're working with. bert-base-uncased is a commonly used LLM that is pretrained on a large corpus of text and is a good starting point for many NLP tasks. It's designed for uncased text, meaning all the text is lowercased before training, which can be helpful for tasks where the case of the words might not matter much.

However, there are several other pre-trained LLMs available through the Hugging Face Transformers library that you can consider based on your project's requirements. Some alternatives include:

bert-base-cased: Similar to bert-base-uncased, but retains the case of the text. This might be suitable if preserving the original case is important for your task.

roberta-base: This is another popular model that is based on the BERT architecture but uses a modified training approach. It has shown strong performance on various NLP benchmarks.

distilbert-base-uncased: This is a smaller and faster version of BERT that still provides competitive performance on various tasks. It's useful when computational resources are limited.

xlnet-base-cased: XLNet is another architecture that has achieved state-of-the-art results on multiple NLP benchmarks. It's known for its autoregressive training approach.

gpt2: While originally designed for text generation, GPT-2 can also be fine-tuned for classification tasks. It might be useful if you're interested in text generation alongside compliance classification.

The choice of LLM should take into account factors such as model size, training data, computational resources, and task-specific requirements. It's recommended to experiment with different models and evaluate their performance on your specific dataset to determine the best fit for your compliance monitoring task.

In [164]:
from sklearn.model_selection import train_test_split
from transformers import pipeline, BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [165]:
# Define your LLM and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [166]:
from sklearn.model_selection import train_test_split

# Split the dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(df['URL'], df['Status'], test_size=0.2, random_state=42)


In [167]:
# Tokenize the texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

In [168]:
print(train_labels.unique())
print(test_labels.unique())


['302' '200' '304' '206' 'No' '404' '2018]' '2017]' 'dumped' 'Aborted'
 'Assertion' 'Segmentation' 'found']
['200' '302' '304' 'No' '404' '2018]' '206' '2017]' 'Assertion' 'dumped']


In [169]:
# Map compliance labels to numerical values
label_mapping = {'non-compliant': 0, 'compliant': 1}


In [170]:
# Apply label encoding to the 'Status' column (or the correct column name)
df['Status'] = df['Status'].map(label_mapping)

In [171]:
# Remove instances with unexpected labels (if needed)
valid_labels = ['non-compliant', 'compliant']
df = df[df['Status'].isin(valid_labels)]

In [172]:
df.head()

Unnamed: 0,IP,URL,Status,Date,TimeOfDay,URL_Encoded,Method_Encoded


In [173]:
# Split the dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(df['URL'], df['Status'], test_size=0.2, random_state=42)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
import torch

# Convert labels to tensors
train_labels = torch.tensor(train_labels.tolist())
test_labels = torch.tensor(test_labels.tolist())


ValueError: too many dimensions 'str'

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

In [None]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=test_encodings,
)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
# Evaluate the model
predictions = trainer.predict(test_encodings)
predicted_labels = np.argmax(predictions.predictions, axis=1)

In [None]:
# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

In [None]:
# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")