In [None]:
import pandas as pd
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the data from the specified Excel workbook
input_file_path = '/content/critical  labae encodee.xlsx'
df = pd.read_excel(input_file_path)

# Ensure "description" column exists
if 'description' not in df.columns:
    raise ValueError("The 'description' column is missing from the Excel file.")

# The tokenizer only accepts strings: turn missing cells into '' and coerce
# any non-text cell (e.g. a number) to its string form before tokenizing.
descriptions = df['description'].fillna('').astype(str).tolist()

# Tokenize the descriptions and create input IDs and attention masks
tokenized_data = tokenizer(
    descriptions,
    padding=True,
    truncation=True,
    return_tensors='np'  # Use 'pt' for PyTorch tensors if needed
)

# Add input IDs and attention masks to the DataFrame (one list per row)
df['input_ids'] = tokenized_data['input_ids'].tolist()
df['attention_mask'] = tokenized_data['attention_mask'].tolist()

# Save the DataFrame to a new CSV file.
# NOTE: to_csv writes each list cell as its string representation, so a
# reader of this file has to parse the two columns back into lists.
output_file_path = '/content/1s.csv'
df.to_csv(output_file_path, index=False)

print(f"Tokenized data saved to '{output_file_path}'.")

Tokenized data saved to '/content/1s.csv'.


In [None]:
import pandas as pd

# Source workbook holding the raw descriptions
data_path = '/content/critical  labae encodee.xlsx'
data = pd.read_excel(data_path)

# Sanity check: show the first five rows before modifying anything
print(data.head(n=5))

# Normalise case so later text steps see lower-case descriptions
lowercased = data['description'].str.lower()
data['description'] = lowercased
del lowercased  # keep the cell's namespace the same as before

# Persist the result as CSV (row index omitted)
output_path = '/content/1.csv'
data.to_csv(path_or_buf=output_path, index=False)

print(f'Modified data saved to: {output_path}')

                                         description  severity 
0  Improper Input Validation vulnerability in the...          3
1  Improper input validation in some Intel(R) Neu...          3
2  This package provides universal methods to use...          3
3  Zabbix server can perform command execution fo...          3
4  Discord-Recon is a Discord bot created to auto...          3
Modified data saved to: /content/1.csv


In [None]:
import pandas as pd
import spacy

# spaCy English pipeline used by the stop-word filter defined in this cell
nlp = spacy.load("en_core_web_sm")

# Read the lower-cased descriptions from the CSV written by the previous step
data_path = "/content/1.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: show the first five rows
print(data.head(n=5))

# Define a function to remove stop words
def remove_stopwords(text):
    """Return ``text`` with spaCy stop-word tokens removed.

    The text is parsed with the module-level ``nlp`` pipeline and the tokens
    whose ``is_stop`` flag is false are joined with single spaces.

    Args:
        text: Description string. Any non-string value (for example the NaN
            that pandas produces for an empty CSV cell) is treated as missing.

    Returns:
        The filtered text, or '' when ``text`` is not a string.
    """
    # Empty CSV cells are read back as float NaN, which the spaCy pipeline
    # cannot parse; map them to an empty description instead of crashing.
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    filtered_text = ' '.join(token.text for token in doc if not token.is_stop)
    return filtered_text

# Strip stop words from every description, row by row
data['description'] = [remove_stopwords(text) for text in data['description']]

# Write the filtered frame to its own CSV (row index omitted)
output_path = '/content/2.csv'
data.to_csv(path_or_buf=output_path, index=False)

print(f'Modified data saved to: {output_path}')

                                         description  severity 
0  improper input validation vulnerability in the...          3
1  improper input validation in some intel(r) neu...          3
2  this package provides universal methods to use...          3
3  zabbix server can perform command execution fo...          3
4  discord-recon is a discord bot created to auto...          3
Modified data saved to: /content/2.csv


In [None]:
import pandas as pd
import spacy

# spaCy English pipeline used by the lemmatizer defined in this cell
nlp = spacy.load("en_core_web_sm")

# Read the stop-word-filtered descriptions from the CSV written by the previous step
data_path = "/content/2.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: show the first five rows
print(data.head(n=5))

# Define a function to lemmatize text
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    The text is parsed with the module-level ``nlp`` pipeline and the
    ``lemma_`` of each token is joined with single spaces.

    Args:
        text: Description string. Any non-string value (for example the NaN
            that pandas produces for an empty CSV cell) is treated as missing.

    Returns:
        The lemmatized text, or '' when ``text`` is not a string.
    """
    # A description that became empty in an earlier step is read back from
    # CSV as float NaN, which the spaCy pipeline cannot parse.
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    lemmatized_text = ' '.join(token.lemma_ for token in doc)
    return lemmatized_text

# Lemmatize every description, row by row
data['description'] = [lemmatize_text(text) for text in data['description']]

# Write the lemmatized frame to its own CSV (row index omitted)
output_path = '/content/3.csv'
data.to_csv(path_or_buf=output_path, index=False)

print(f'Modified data saved to: {output_path}')


                                         description  severity 
0  improper input validation vulnerability upload...          3
1  improper input validation intel(r ) neural com...          3
2  package provides universal methods use multipl...          3
3  zabbix server perform command execution config...          3
4  discord - recon discord bot created automate b...          3
Modified data saved to: /content/3.csv


In [None]:
import pandas as pd
import spacy

# spaCy English pipeline used by the tokenizer helper defined in this cell
nlp = spacy.load("en_core_web_sm")

# Read the lemmatized descriptions from the CSV written by the previous step
data_path = "/content/3.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: show the first five rows
print(data.head(n=5))

# Define a function to tokenize text
def tokenize_text(text):
    """Tokenize ``text`` with spaCy and return the tokens as one string.

    The text is parsed with the module-level ``nlp`` pipeline; the token
    texts are joined back together with single spaces.

    Args:
        text: Description string. Any non-string value (for example the NaN
            that pandas produces for an empty CSV cell) is treated as missing.

    Returns:
        The space-separated tokens, or '' when ``text`` is not a string.
    """
    # A description that became empty in an earlier step is read back from
    # CSV as float NaN, which the spaCy pipeline cannot parse.
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    tokens = [token.text for token in doc]  # Extract tokens
    return ' '.join(tokens)  # Join tokens back into a single string

# Tokenize every description, row by row
data['description'] = [tokenize_text(text) for text in data['description']]

# Write the tokenized frame to its own CSV (row index omitted)
output_path = '/content/num$4.csv'
data.to_csv(path_or_buf=output_path, index=False)

print(f'Modified data saved to: {output_path}')

                                         description  severity 
0  improper input validation vulnerability upload...          3
1  improper input validation intel(r ) neural com...          3
2  package provide universal method use multiple ...          3
3  zabbix server perform command execution config...          3
4  discord - recon discord bot create automate bu...          3
Modified data saved to: /content/num$4.csv


In [None]:
import pandas as pd
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the CSV file
file_path = '/content/3.csv'
data = pd.read_csv(file_path)

# Tokenize the 'description' column using spaCy.
# Empty cells are read back from CSV as float NaN, which the pipeline cannot
# parse, so any non-string value becomes an empty token list.
data['description'] = data['description'].apply(
    lambda x: [token.text for token in nlp(x)] if isinstance(x, str) else []
)

# Save the modified DataFrame back to a CSV file.
# NOTE: to_csv writes each token list as its string representation
# (e.g. "['improper', 'input']"), not as separate values.
output_path = '/content/4.csv'
data.to_csv(output_path, index=False)

print(f"Tokenized data saved to {output_path}")

Tokenized data saved to /content/4.csv


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset
data = pd.read_csv('/content/4.csv')

# Separate the textual value; a missing cell would load as NaN, which the
# vectorizer rejects, so replace it with an empty document.
x = data['description'].fillna('').values

# Convert textual data into feature vectors (learn vocabulary/IDF and
# transform in a single pass)
vectorizer = TfidfVectorizer()
x_transformed = vectorizer.fit_transform(x)

# Save the encoded data with all values of one row in a single cell.
# Each non-zero entry is written as "(row, column)weight".
with open('/content/aug+pre tfi.csv', 'w') as f:
    f.write("description\n")  # Column header
    for i in range(x_transformed.shape[0]):
        row = x_transformed[i]  # slice the sparse row once, reuse for indices and data
        row_data = " ".join(f"({i}, {idx}){value}" for idx, value in zip(row.indices, row.data))
        f.write(f'"{row_data}"\n')  # Add quotes to ensure single cell

# Display the first few lines of the saved file (at most 10; stops at end of
# file instead of printing blank lines for a short file)
with open('/content/aug+pre tfi.csv') as f:
    for line_number, line in enumerate(f):
        if line_number >= 10:
            break
        print(line)


description

"(0, 446)0.08774108373334055 (0, 617)0.1349129069352227 (0, 750)0.17450338662545228 (0, 1863)0.08783849655790449 (0, 2045)0.22416695494608804 (0, 3675)0.26364737202916333 (0, 3880)0.2426597666067508 (0, 4576)0.2975768640590574 (0, 4821)0.3194525240773637 (0, 5235)0.3323119822946025 (0, 5273)0.23464207906805085 (0, 6188)0.31468165089018885 (0, 6634)0.12454124739475962 (0, 7027)0.3460277180692651 (0, 8315)0.20031983231494463 (0, 8382)0.10941387967275404 (0, 8495)0.20145177960315394 (0, 8624)0.09468087852617457 (0, 8693)0.184227942578642 (0, 9138)0.17703527553651446"

"(1, 153)0.36948545809062794 (1, 715)0.15907484616272488 (1, 750)0.20936628419154552 (1, 1637)0.25375951154333476 (1, 1863)0.10538717894783813 (1, 1916)0.07494095117789706 (1, 2474)0.2999104351516545 (1, 3100)0.11646949043219784 (1, 3716)0.2696594182555352 (1, 4385)0.20279825755413514 (1, 4467)0.23395524970684467 (1, 4700)0.26619790563151374 (1, 6285)0.29054266587638655 (1, 6634)0.1494225338536649 (1, 6672)0.099

Shape of X: (1970, 199)
Shape of y: (1970,)
Cross-validation accuracy scores: [0.84130435 0.82826087 0.83006536]
Mean cross-validation accuracy: 0.8332101922894762


AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
pip install scikeras


Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


Shape of X: (7154, 1064)
Shape of y: (7154,)
Cross-validation accuracy scores: [0.63273453 0.61676647 0.62037962 0.63836164 0.61838162]
Mean cross-validation accuracy: 0.6253247750253739
Accuracy: 0.68
F1 Score: 0.68
Precision: 0.69
Recall: 0.68

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.77      0.86       239
           1       0.55      0.54      0.54       306
           2       0.65      0.74      0.69       434
           3       0.69      0.62      0.65        95

    accuracy                           0.68      1074
   macro avg       0.71      0.67      0.69      1074
weighted avg       0.69      0.68      0.68      1074



In [None]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset from an Excel file
file_path = '/content/mixed.xlsx'
data = pd.read_excel(file_path)

# Fill missing values in 'description' field with an empty string
data['description'] = data['description'].fillna('')

# Encode the target variable
y = data['severity']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified 70/15/15 split into train, validation and test sets.
# The split is made on row positions BEFORE any text feature is fitted, so
# the tokenizer below can be fitted on training rows only.
row_positions = np.arange(len(data))
train_idx, temp_idx, y_train, y_temp = train_test_split(
    row_positions, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)
val_idx, test_idx, y_val, y_test = train_test_split(
    temp_idx, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Tokenize the text data. The vocabulary is learned from the training rows
# only: fitting it on the whole corpus would leak validation/test vocabulary
# statistics into the features.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['description'].iloc[train_idx])
X = tokenizer.texts_to_sequences(data['description'])
X = pad_sequences(X, padding='post')

# Print the shape to verify
print("Shape of X:", X.shape)
print("Shape of y:", data['severity'].shape)

# Slice the padded matrix into the three partitions
X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]

# Define the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform 5-fold cross-validation on the training partition
cv = StratifiedKFold(n_splits=5)
cv_results = cross_val_score(rf_model, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation results
print(f"Cross-validation accuracy scores: {cv_results}")
print(f"Mean cross-validation accuracy: {np.mean(cv_results)}")

# Fit the model on the entire training set
rf_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics (weighted by class support)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

# Print the classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Shape of X: (7154, 1064)
Shape of y: (7154,)
Cross-validation accuracy scores: [0.71357285 0.70159681 0.7002997  0.72327672 0.6973027 ]
Mean cross-validation accuracy: 0.7072097563115527
Accuracy: 0.70
F1 Score: 0.70
Precision: 0.71
Recall: 0.70

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.82      0.84       239
           1       0.61      0.55      0.57       306
           2       0.68      0.79      0.73       434
           3       0.76      0.54      0.63        95

    accuracy                           0.70      1074
   macro avg       0.73      0.67      0.69      1074
weighted avg       0.71      0.70      0.70      1074



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from an Excel file
file_path = '/content/mixed.xlsx'
data = pd.read_excel(file_path)

# Fill missing values in 'description' field with an empty string
data['description'] = data['description'].fillna('')

# Encode the target variable
y = data['severity']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified 70/15/15 split into train, validation and test sets.
# The split is made on row positions BEFORE the vectorizer is fitted, so the
# TF-IDF vocabulary and IDF weights are learned from training rows only.
row_positions = np.arange(len(data))
train_idx, temp_idx, y_train, y_temp = train_test_split(
    row_positions, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)
val_idx, test_idx, y_val, y_test = train_test_split(
    temp_idx, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Convert text to TF-IDF features. Fitting on the full corpus would leak
# validation/test term statistics into the training features.
tfidf = TfidfVectorizer(max_features=1000)
tfidf.fit(data['description'].iloc[train_idx])
X = tfidf.transform(data['description']).toarray()

# Print the shape to verify
print("Shape of X:", X.shape)
print("Shape of y:", data['severity'].shape)

# Slice the feature matrix into the three partitions
X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]

# Define the Random Forest model with hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
# With GridSearchCV's default refit=True, best_estimator_ has already been
# refitted on the whole training partition with the best parameters.
best_rf_model = grid_search.best_estimator_

# Perform 5-fold cross-validation (cross_val_score fits clones, so the
# fitted best_rf_model itself is left untouched)
cv = StratifiedKFold(n_splits=5)
cv_results = cross_val_score(best_rf_model, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation results
print(f"Cross-validation accuracy scores: {cv_results}")
print(f"Mean cross-validation accuracy: {np.mean(cv_results)}")

# Predict on the held-out test set with the already-fitted best model
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics (weighted by class support)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

# Print the classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Shape of X: (7154, 1000)
Shape of y: (7154,)


  _data = np.array(data, dtype=dtype, copy=copy,


Cross-validation accuracy scores: [0.79041916 0.78642715 0.76523477 0.77122877 0.77622378]
Mean cross-validation accuracy: 0.7779067240145084
Accuracy: 0.80
F1 Score: 0.79
Precision: 0.80
Recall: 0.80

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.85      0.89       239
           1       0.74      0.64      0.68       306
           2       0.75      0.88      0.81       434
           3       0.87      0.78      0.82        95

    accuracy                           0.80      1074
   macro avg       0.82      0.79      0.80      1074
weighted avg       0.80      0.80      0.79      1074



Data has been successfully saved to /content/mixed.csv
