In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the SVM model
def create_svm_model():
    model = svm.SVC(
        C=1.0,
        kernel='linear',  # You can experiment with 'rbf', 'poly', etc.
        decision_function_shape='ovo',  # One-vs-One for multi-class classification
        probability=True  # Needed for predict_proba method
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_svm_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_svm_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
joblib.dump(model_final, 'svm_model.pkl')
print("Model saved to svm_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       771
           1       0.75      0.74      0.75       771
           2       0.78      0.74      0.76       771
           3       0.97      0.99      0.98       771

    accuracy                           0.86      3084
   macro avg       0.86      0.86      0.86      3084
weighted avg       0.86      0.86      0.86      3084

[[751  10  10   0]
 [ 37 573 151  10]
 [ 11 175 572  13]
 [  5   1   3 762]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       771
           1       0.76      0.74      0.75       771
           2       0.78      0.73      0.75       771
           3       0.96      0.99   

NameError: name 'joblib' is not defined

In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Decision Tree model
def create_decision_tree_model():
    model = DecisionTreeClassifier(
        criterion='gini',
        max_depth=10,  # You can experiment with different depths
        random_state=42
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_decision_tree_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_decision_tree_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
import joblib
joblib.dump(model_final, 'decision_tree_model.pkl')
print("Model saved to decision_tree_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

           0       0.89      0.83      0.86       771
           1       0.78      0.48      0.60       771
           2       0.56      0.77      0.65       771
           3       0.80      0.87      0.83       771

    accuracy                           0.74      3084
   macro avg       0.76      0.74      0.73      3084
weighted avg       0.76      0.74      0.73      3084

[[639  13  78  41]
 [ 52 371 286  62]
 [ 25  90 591  65]
 [  6   1  96 668]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       771
           1       0.76      0.48      0.59       771
           2       0.55      0.73      0.63       771
           3       0.77      0.77   

In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the KNN model
def create_knn_model():
    model = KNeighborsClassifier(
        n_neighbors=5,  # You can experiment with different values of k
        algorithm='auto',  # You can experiment with 'ball_tree', 'kd_tree', etc.
        metric='minkowski'  # You can experiment with different distance metrics
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_knn_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_knn_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
import joblib
joblib.dump(model_final, 'knn_model.pkl')
print("Model saved to knn_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       771
           1       0.68      0.71      0.69       771
           2       0.76      0.62      0.68       771
           3       0.96      1.00      0.98       771

    accuracy                           0.83      3084
   macro avg       0.82      0.83      0.82      3084
weighted avg       0.82      0.83      0.82      3084

[[764   3   4   0]
 [ 66 547 146  12]
 [ 16 258 478  19]
 [  0   2   1 768]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       771
           1       0.69      0.70      0.70       771
           2       0.76      0.61      0.68       771
           3       0.95      1.00   

In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_random_forest_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_random_forest_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
import joblib
joblib.dump(model_final, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       771
           1       0.79      0.75      0.77       771
           2       0.80      0.80      0.80       771
           3       0.99      1.00      0.99       771

    accuracy                           0.88      3084
   macro avg       0.88      0.88      0.88      3084
weighted avg       0.88      0.88      0.88      3084

[[755  12   4   0]
 [ 39 581 147   4]
 [  5 142 620   4]
 [  0   0   3 768]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       771
           1       0.79      0.77      0.78       771
           2       0.81      0.80      0.80       771
           3       0.99      1.00   

In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model = create_random_forest_model()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and results
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# Train the final model with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = best_model.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
import joblib
joblib.dump(best_model, 'random_forest_model_with_tuning.pkl')
print("Model saved to random_forest_model_with_tuning.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Fitting 3 folds for each of 108 candidates, totalling 324 fits




Best parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation accuracy: 0.8804582792909641
              precision    recall  f1-score   support

           0       0.23      0.36      0.28        28
           1       0.69      0.65      0.67       407
           2       0.78      0.80      0.79       579
           3       0.12      0.09      0.10        23

    accuracy                           0.71      1037
   macro avg       0.46      0.47      0.46      1037
weighted avg       0.71      0.71      0.71      1037

[[ 10  15   3   0]
 [ 27 266 111   3]
 [  6 101 461  11]
 [  1   3  17   2]]
Model saved to random_forest_model_with_tuning.pkl


In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Load the dataset
data = pd.read_csv('/content/no tok +pre.csv')

# Drop rows where 'description' or 'severity' is NaN
data = data.dropna(subset=['description', 'severity'])

# Define the input and target attributes
X = data['description']
y = data['severity']

# Perform stratified sampling
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X, y):
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

# Save the splits to separate CSV files
strat_train_set.to_csv('/content/stratified_train_data.csv', index=False)
strat_test_set.to_csv('/content/stratified_test_data.csv', index=False)

print("Stratified sampling complete. Training data saved to /content/stratified_train_data.csv")
print("Test data saved to /content/stratified_test_data.csv")


Stratified sampling complete. Training data saved to /content/stratified_train_data.csv
Test data saved to /content/stratified_test_data.csv


In [11]:
import pandas as pd
import random
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')

# Load the training data
df = pd.read_csv('/content/stratified_train_data.csv')

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return list(set(synonyms))

def insertion(text, n):
    words = text.split()
    for _ in range(n):
        add_word(words)
    return ' '.join(words)

def add_word(words):
    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = words[random.randint(0, len(words) - 1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(words) - 1)
    words.insert(random_idx, random_synonym)

def deletion(text, p):
    words = text.split()
    if len(words) == 1:
        return text
    new_words = [word for word in words if random.uniform(0, 1) > p]
    if len(new_words) == 0:
        return random.choice(words)
    return ' '.join(new_words)

def substitution(text, n):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # Only replace up to n words
            break
    return ' '.join(new_words)

def augment_text(text):
    augmented_texts = []
    augmented_texts.append(insertion(text, n=1))
    augmented_texts.append(deletion(text, p=0.1))
    augmented_texts.append(substitution(text, n=1))
    return augmented_texts

# Balance training data
max_samples = df['severity'].value_counts().max()
balanced_data = []

for severity, group in df.groupby('severity'):
    while len(group) < max_samples:
        augmented_texts = []
        for index, row in group.iterrows():
            augmented_texts.extend(augment_text(row['description']))
            if len(augmented_texts) >= max_samples - len(group):
                break
        augmented_df = pd.DataFrame({'description': augmented_texts[:max_samples - len(group)], 'severity': severity})
        group = pd.concat([group, augmented_df], ignore_index=True)
    balanced_data.append(group)

balanced_df = pd.concat(balanced_data, ignore_index=True)

# Print class distribution to make sure it is balanced
print("Class distribution in the balanced training data:")
print(balanced_df['severity'].value_counts())

# Save the balanced and augmented training data to a new CSV file
balanced_df.to_csv('/content/augmented_balanced_train_data.csv', index=False)

print("Data augmentation and balancing complete. Saved to /content/augmented_balanced_train_data.csv")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Class distribution in the balanced training data:
severity
0.0    1850
1.0    1850
2.0    1850
3.0    1850
Name: count, dtype: int64
Data augmentation and balancing complete. Saved to /content/augmented_balanced_train_data.csv


In [4]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/stratified_test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_random_forest_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_random_forest_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
import joblib
joblib.dump(model_final, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96       771
         1.0       0.79      0.77      0.78       771
         2.0       0.82      0.80      0.81       771
         3.0       0.98      1.00      0.99       771

    accuracy                           0.89      3084
   macro avg       0.89      0.89      0.89      3084
weighted avg       0.89      0.89      0.89      3084

[[755  14   1   1]
 [ 45 591 131   4]
 [  4 140 620   7]
 [  1   0   0 770]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96       771
         1.0       0.78      0.76      0.77       771
         2.0       0.80      0.79      0.80       771
         3.0       0.99      0.99   

In [5]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/stratified_test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_random_forest_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_random_forest_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Optionally: Save the model for later use
import joblib
joblib.dump(model_final, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95       232
         1.0       0.82      0.76      0.79       231
         2.0       0.84      0.84      0.84       231
         3.0       0.99      1.00      0.99       232

    accuracy                           0.90       926
   macro avg       0.89      0.90      0.89       926
weighted avg       0.89      0.90      0.89       926

[[228   3   0   1]
 [ 19 175  36   1]
 [  0  35 195   1]
 [  0   0   0 232]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96       232
         1.0       0.81      0.75      0.78       231
         2.0       0.82      0.82      0.82       231
         3.0       0.99      1.00   

In [6]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
test_data = pd.read_csv('/content/stratified_test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

train_accuracies = []
val_accuracies = []

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_random_forest_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the training set
    y_train_pred_fold = model.predict(X_train_fold)
    train_accuracy = accuracy_score(y_train_fold, y_train_pred_fold)
    train_accuracies.append(train_accuracy)

    # Evaluate the model on the validation set
    y_val_pred_fold = model.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_fold)
    val_accuracies.append(val_accuracy)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_val_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_val_pred_fold))

    fold += 1

# Final evaluation on the test set
print("Final evaluation on the test set:")

# Train final model
model_final = create_random_forest_model()
model_final.fit(X_train_tfidf, y_train)

# Get predictions on the test set
y_pred_test = model_final.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_pred_test, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_pred_test))

# Calculate and print average training and validation accuracies
average_train_accuracy = np.mean(train_accuracies)
average_val_accuracy = np.mean(val_accuracies)
print(f"Average training accuracy: {average_train_accuracy}")
print(f"Average validation accuracy: {average_val_accuracy}")

# Optionally: Save the model for later use
import joblib
joblib.dump(model_final, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")


Class distribution in the training data:
Class 0: 2313 samples
Class 1: 2313 samples
Class 2: 2313 samples
Class 3: 2313 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95       232
         1.0       0.82      0.76      0.79       231
         2.0       0.84      0.84      0.84       231
         3.0       0.99      1.00      0.99       232

    accuracy                           0.90       926
   macro avg       0.89      0.90      0.89       926
weighted avg       0.89      0.90      0.89       926

[[228   3   0   1]
 [ 19 175  36   1]
 [  0  35 195   1]
 [  0   0   0 232]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.93      0.99      0.96       232
         1.0       0.81      0.75      0.78       231
         2.0       0.82      0.82      0.82       231
         3.0       0.99      1.00   

In [10]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Load the dataset
data = pd.read_csv('/content/no tok +pre.csv')

# Drop rows where 'description' or 'severity' is NaN
data = data.dropna(subset=['description', 'severity'])

# Define the input and target attributes
X = data['description']
y = data['severity']

# Perform the first stratified sampling to split the data into training and test sets
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X, y):
    strat_train_set = data.iloc[train_index].copy()
    strat_test_set = data.iloc[test_index].copy()

# Define input and target attributes for the training set
X_train = strat_train_set['description']
y_train = strat_train_set['severity']

# Perform the second stratified sampling on the training set to create validation and training sets
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, val_index in split.split(X_train, y_train):
    final_train_set = strat_train_set.iloc[train_index].copy()
    strat_val_set = strat_train_set.iloc[val_index].copy()

# Save the splits to separate CSV files
final_train_set.to_csv('/content/augmented_balanced_train_data.csv', index=False)
strat_val_set.to_csv('/content/stratified_val_data.csv', index=False)
strat_test_set.to_csv('/content/stratified_test_data.csv', index=False)

print("Stratified sampling complete. Training data saved to /content/stratified_train_data.csv")
print("Validation data saved to /content/stratified_val_data.csv")
print("Test data saved to /content/stratified_test_data.csv")


Stratified sampling complete. Training data saved to /content/stratified_train_data.csv
Validation data saved to /content/stratified_val_data.csv
Test data saved to /content/stratified_test_data.csv


In [13]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
val_data = pd.read_csv('/content/stratified_val_data.csv')
test_data = pd.read_csv('/content/stratified_test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
val_data['description'] = val_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_val_tfidf = tfidf_vectorizer.transform(val_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_val = label_encoder.transform(val_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

train_accuracies = []
val_accuracies = []

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create the model
    model = create_random_forest_model()

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the training set
    y_train_pred_fold = model.predict(X_train_fold)
    train_accuracy = accuracy_score(y_train_fold, y_train_pred_fold)
    train_accuracies.append(train_accuracy)

    # Evaluate the model on the validation set
    y_val_pred_fold = model.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_fold)
    val_accuracies.append(val_accuracy)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_val_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_val_pred_fold))

    fold += 1

# Train final model on full training data
final_model = create_random_forest_model()
final_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the validation set
y_val_pred = final_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation accuracy: {val_accuracy}")

# Print classification report and confusion matrix for the validation set
print(f"Classification Report for validation set:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_val, y_val_pred))

# Final evaluation on the test set
print("Final evaluation on the test set:")
y_test_pred = final_model.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_test_pred))

# Calculate and print average training and validation accuracies
average_train_accuracy = np.mean(train_accuracies)
average_val_accuracy = np.mean(val_accuracies)
print(f"Average training accuracy: {average_train_accuracy}")
print(f"Average validation accuracy: {average_val_accuracy}")

# Optionally: Save the model for later use
import joblib
joblib.dump(final_model, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")


Class distribution in the training data:
Class 0: 1850 samples
Class 1: 1850 samples
Class 2: 1850 samples
Class 3: 1850 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.91      0.98      0.95       185
         1.0       0.80      0.71      0.76       185
         2.0       0.82      0.84      0.83       185
         3.0       0.99      1.00      0.99       185

    accuracy                           0.88       740
   macro avg       0.88      0.88      0.88       740
weighted avg       0.88      0.88      0.88       740

[[181   4   0   0]
 [ 17 132  35   1]
 [  0  28 156   1]
 [  0   0   0 185]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.97       185
         1.0       0.81      0.74      0.77       185
         2.0       0.79      0.83      0.81       185
         3.0       1.00      1.00   

In [14]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/augmented_balanced_train_data.csv')
val_data = pd.read_csv('/content/stratified_val_data.csv')
test_data = pd.read_csv('/content/stratified_test_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
val_data['description'] = val_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_val_tfidf = tfidf_vectorizer.transform(val_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_val = label_encoder.transform(val_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    return model

# Define the XGBoost model
def create_xgboost_model():
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        objective='multi:softmax',
        num_class=len(np.unique(y_train)),
        random_state=42
    )
    return model

# Create the ensemble model
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', create_random_forest_model()),
        ('xgb', create_xgboost_model())
    ],
    voting='soft'  # Use soft voting for probability-based voting
)

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

train_accuracies = []
val_accuracies = []

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Train the ensemble model
    ensemble_model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the training set
    y_train_pred_fold = ensemble_model.predict(X_train_fold)
    train_accuracy = accuracy_score(y_train_fold, y_train_pred_fold)
    train_accuracies.append(train_accuracy)

    # Evaluate the model on the validation set
    y_val_pred_fold = ensemble_model.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_fold)
    val_accuracies.append(val_accuracy)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_val_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_val_pred_fold))

    fold += 1

# Train final model on full training data
final_model = VotingClassifier(
    estimators=[
        ('rf', create_random_forest_model()),
        ('xgb', create_xgboost_model())
    ],
    voting='soft'
)
final_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the validation set
y_val_pred = final_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation accuracy: {val_accuracy}")

# Print classification report and confusion matrix for the validation set
print(f"Classification Report for validation set:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_val, y_val_pred))

# Final evaluation on the test set
print("Final evaluation on the test set:")
y_test_pred = final_model.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_test_pred))

# Calculate and print average training and validation accuracies
average_train_accuracy = np.mean(train_accuracies)
average_val_accuracy = np.mean(val_accuracies)
print(f"Average training accuracy: {average_train_accuracy}")
print(f"Average validation accuracy: {average_val_accuracy}")

# Optionally: Save the model for later use
import joblib
joblib.dump(final_model, 'ensemble_model.pkl')
print("Model saved to ensemble_model.pkl")


Class distribution in the training data:
Class 0: 1850 samples
Class 1: 1850 samples
Class 2: 1850 samples
Class 3: 1850 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.91      0.98      0.95       185
         1.0       0.80      0.69      0.74       185
         2.0       0.80      0.82      0.81       185
         3.0       0.97      1.00      0.98       185

    accuracy                           0.87       740
   macro avg       0.87      0.87      0.87       740
weighted avg       0.87      0.87      0.87       740

[[182   3   0   0]
 [ 18 128  37   2]
 [  0  29 152   4]
 [  0   0   0 185]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95       185
         1.0       0.83      0.74      0.78       185
         2.0       0.80      0.80      0.80       185
         3.0       0.97      1.00   

In [4]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from collections import Counter

# Load the datasets
train_data = pd.read_csv('/content/strat augmented_balanced_train_data.csv')
val_data = pd.read_csv('/content/stratified_val_data.csv')
test_data = pd.read_csv('/content/stratified_val_data.csv')

# Initialize spaCy and tokenize the text
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Tokenize the descriptions
train_data['description'] = train_data['description'].apply(tokenize_text)
val_data['description'] = val_data['description'].apply(tokenize_text)
test_data['description'] = test_data['description'].apply(tokenize_text)

# Encode the descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['description']).toarray()
X_val_tfidf = tfidf_vectorizer.transform(val_data['description']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['description']).toarray()

# Encode the target variable (severity)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['severity'])
y_val = label_encoder.transform(val_data['severity'])
y_test = label_encoder.transform(test_data['severity'])

# Check class distribution
class_distribution = Counter(y_train)
print("Class distribution in the training data:")
for class_label, count in class_distribution.items():
    print(f"Class {class_label}: {count} samples")

# Define the Random Forest model
def create_random_forest_model():
    model = RandomForestClassifier(
        n_estimators=100,  # Number of trees in the forest
        criterion='gini',  # Function to measure the quality of a split
        max_depth=None,  # Maximum depth of the tree
        random_state=42,
        n_jobs=-1  # Use all available cores
    )
    return model

# Create the Bagging model with Random Forest
bagging_model = BaggingClassifier(
    estimator=create_random_forest_model(),
    n_estimators=10,  # Number of base estimators in the ensemble
    random_state=42
)

# Perform cross-validation using StratifiedKFold
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

train_accuracies = []
val_accuracies = []

fold = 1
for train_index, val_index in kf.split(X_train_tfidf, y_train):
    print(f"Training fold {fold}...")

    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Train the Bagging model
    bagging_model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the training set
    y_train_pred_fold = bagging_model.predict(X_train_fold)
    train_accuracy = accuracy_score(y_train_fold, y_train_pred_fold)
    train_accuracies.append(train_accuracy)

    # Evaluate the model on the validation set
    y_val_pred_fold = bagging_model.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_fold)
    val_accuracies.append(val_accuracy)

    # Print classification report and confusion matrix for this fold
    print(f"Classification Report for fold {fold}:")
    print(classification_report(y_val_fold, y_val_pred_fold, target_names=label_encoder.classes_.astype(str)))
    print(confusion_matrix(y_val_fold, y_val_pred_fold))

    fold += 1

# Train final model on full training data
bagging_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the validation set
y_val_pred = bagging_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation accuracy: {val_accuracy}")

# Print classification report and confusion matrix for the validation set
print(f"Classification Report for validation set:")
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_val, y_val_pred))

# Final evaluation on the test set
print("Final evaluation on the test set:")
y_test_pred = bagging_model.predict(X_test_tfidf)

# Print classification report and confusion matrix on the test set
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_.astype(str)))
print(confusion_matrix(y_test, y_test_pred))

# Calculate and print average training and validation accuracies
average_train_accuracy = np.mean(train_accuracies)
average_val_accuracy = np.mean(val_accuracies)
print(f"Average training accuracy: {average_train_accuracy}")
print(f"Average validation accuracy: {average_val_accuracy}")

# Optionally: Save the model for later use
import joblib
joblib.dump(bagging_model, 'random_forest_bagging_model.pkl')
print("Model saved to random_forest_bagging_model.pkl")


Class distribution in the training data:
Class 0: 1850 samples
Class 1: 1850 samples
Class 2: 1850 samples
Class 3: 1850 samples
Training fold 1...
Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95       185
         1.0       0.81      0.70      0.75       185
         2.0       0.81      0.83      0.82       185
         3.0       0.98      1.00      0.99       185

    accuracy                           0.88       740
   macro avg       0.88      0.88      0.88       740
weighted avg       0.88      0.88      0.88       740

[[185   0   0   0]
 [ 18 130  36   1]
 [  0  30 153   2]
 [  0   0   0 185]]
Training fold 2...
Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97       185
         1.0       0.81      0.72      0.76       185
         2.0       0.78      0.83      0.80       185
         3.0       1.00      1.00   