In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import numpy as np
from scipy.stats import uniform, randint

# Load the Excel data
file_path = r"/content/roberta_embeddings exal (1).xlsx"
data = pd.read_excel(file_path)

# Display column names to confirm structure
print("Column names:", data.columns)

# Define features and target
X = data.drop(columns=['Class'])  # Adjust to the actual feature columns
y = data['Class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and parameter distributions for RandomizedSearchCV
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': uniform(1e-4, 1e4),
            'model__penalty': ['l2'],
            'model__solver': ['liblinear']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'model__C': uniform(1e-3, 1e3),
            'model__gamma': ['scale', 'auto'],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': [3, 5, 7]
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': randint(3, 20),
            'model__weights': ['uniform', 'distance'],
            'model__p': [1, 2]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}  # No hyperparameters for GaussianNB
    },
    'XGBClassifier': {
        'model': XGBClassifier(eval_metric='logloss', use_label_encoder=False),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': randint(3, 10)
        }
    },
    'LGBMClassifier': {
        'model': LGBMClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': [-1, 10, 20]
        }
    },
    'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'model__iterations': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__depth': randint(4, 10)
        }
    }
}

# Perform RandomizedSearchCV for each model
best_models = {}
for model_name, model_info in models.items():
    print(f"Tuning {model_name}...")
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Applies scaling
        ('model', model_info['model'])
    ])

    search = RandomizedSearchCV(pipe, model_info['params'], n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)
    search.fit(X_train, y_train)

    best_models[model_name] = search.best_estimator_
    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validation score for {model_name}: {search.best_score_:.4f}\n")

# Evaluate best models on the test set
for model_name, model in best_models.items():
    print(f"Evaluating {model_name} on the test set...")
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name}:\n{classification_report(y_test, y_pred)}\n")

Column names: Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
           759,     760,     761,     762,     763,     764,     765,     766,
           767, 'Class'],
      dtype='object', length=769)
Tuning LogisticRegression...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for LogisticRegression: {'model__C': 580.8362216819946, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best cross-validation score for LogisticRegression: 0.5097

Tuning RandomForestClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for RandomForestClassifier: {'model__max_depth': 30, 'model__min_samples_leaf': 8, 'model__min_samples_split': 4, 'model__n_estimators': 199}
Best cross-validation score for RandomForestClassifier: 0.5841

Tuning SVC...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for SVC: {'model__C': 20.585494295802448, 'model_



Best parameters for GaussianNB: {}
Best cross-validation score for GaussianNB: 0.5372

Tuning XGBClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBClassifier: {'model__learning_rate': 0.13349630192554332, 'model__max_depth': 4, 'model__n_estimators': 71}
Best cross-validation score for XGBClassifier: 0.5484

Tuning LGBMClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195821
[LightGBM] [Info] Number of data points in the train set: 1344, number of used features: 768
[LightGBM] [Info] Start training from score -1.094158
[LightGBM] [Info] Start training from score -0.913690
[LightGBM] [Info] Start training from score -1.331288
Best parameters for LGBMClassifier: {'model__learning_rate': 0.10121399684340719, 'model__max_depth': 20, 'model__n_estimators': 100}
Best cross-validation score for LGBMClassifier: 0.5565

Tuning CatBoostClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for CatBoostClassifier: {'model__depth': 4, 'model__iterations': 98, 'model__learning_rate': 0.11495493205167782}
Best cross-validation score for CatBoostClassifier: 0.5781

Evaluating LogisticRegression on the test set...
Classification report for Lo

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB


# Load the CSV data
file_path = r"/content/training_with_glove embeddings_split.xlsx"
data = pd.read_excel(file_path, engine='openpyxl')

# Define features and target
X = data.drop(columns=['input', 'Class'])
y = data['Class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': np.logspace(-4, 4, 10),
            'model__penalty': ['l1', 'l2'],
            'model__solver': ['liblinear', 'saga']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'model__C': np.logspace(-3, 3, 7),
            'model__gamma': ['scale', 'auto'],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': range(3, 15),
            'model__weights': ['uniform', 'distance'],
            'model__metric': ['euclidean', 'manhattan']
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'model__iterations': [100, 200, 300],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__depth': [3, 5, 7]
        }
    },
    'LGBMClassifier': {
        'model': LGBMClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [-1, 10, 20]
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            # GaussianNB has no hyperparameters to tune
        }
    }
}

# Perform RandomizedSearchCV for each model
best_models = {}
for model_name, model_info in models.items():
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model_info['model'])
    ])

    if model_info['params']:
        search = RandomizedSearchCV(pipe, model_info['params'], n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
        search.fit(X_train, y_train)
        best_models[model_name] = search.best_estimator_
        print(f"Best parameters for {model_name}: {search.best_params_}")
        print(f"Best cross-validation score for {model_name}: {search.best_score_:.4f}\n")
    else:
        # Directly fit if no hyperparameters to tune
        pipe.fit(X_train, y_train)
        best_models[model_name] = pipe

# Evaluate best models on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name}:\n{classification_report(y_test, y_pred)}\n")




Best parameters for LogisticRegression: {'model__solver': 'saga', 'model__penalty': 'l2', 'model__C': 0.3593813663804626}
Best cross-validation score for LogisticRegression: 0.5052

Best parameters for RandomForestClassifier: {'model__n_estimators': 50, 'model__min_samples_split': 2, 'model__min_samples_leaf': 4, 'model__max_depth': 10}
Best cross-validation score for RandomForestClassifier: 0.5164

Best parameters for SVC: {'model__kernel': 'rbf', 'model__gamma': 'scale', 'model__C': 1.0}
Best cross-validation score for SVC: 0.5246

Best parameters for KNeighborsClassifier: {'model__weights': 'distance', 'model__n_neighbors': 4, 'model__metric': 'manhattan'}
Best cross-validation score for KNeighborsClassifier: 0.4785

Best parameters for GradientBoostingClassifier: {'model__n_estimators': 50, 'model__max_depth': 5, 'model__learning_rate': 0.2}
Best cross-validation score for GradientBoostingClassifier: 0.5253

Best parameters for DecisionTreeClassifier: {'model__min_samples_split': 5

Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBClassifier: {'model__n_estimators': 100, 'model__max_depth': 5, 'model__learning_rate': 0.1}
Best cross-validation score for XGBClassifier: 0.5119

Best parameters for CatBoostClassifier: {'model__learning_rate': 0.1, 'model__iterations': 200, 'model__depth': 5}
Best cross-validation score for CatBoostClassifier: 0.5238





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76456
[LightGBM] [Info] Number of data points in the train set: 1344, number of used features: 300
[LightGBM] [Info] Start training from score -1.094158
[LightGBM] [Info] Start training from score -0.913690
[LightGBM] [Info] Start training from score -1.331288
Best parameters for LGBMClassifier: {'model__n_estimators': 100, 'model__max_depth': 10, 'model__learning_rate': 0.1}
Best cross-validation score for LGBMClassifier: 0.5164

Classification report for LogisticRegression:
              precision    recall  f1-score   support

           0       0.60      0.44      0.51       125
           1       0.45      0.65      0.53       124
           2       0.48      0.36      0.41        87

    accuracy                           0.50       336
   macro avg       0.51      0.48      0.48       336
we

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import numpy as np
from scipy.stats import uniform, randint

# Load the Excel data
file_path = r"/content/bert_embeddings (1).xlsx"
data = pd.read_excel(file_path)

# Display column names to confirm structure
print("Column names:", data.columns)

# Define features and target
X = data.drop(columns=['Class'])  # Adjust to the actual feature columns
y = data['Class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and parameter distributions for RandomizedSearchCV
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': uniform(1e-4, 1e4),
            'model__penalty': ['l2'],
            'model__solver': ['liblinear']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'model__C': uniform(1e-3, 1e3),
            'model__gamma': ['scale', 'auto'],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': [3, 5, 7]
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': randint(3, 20),
            'model__weights': ['uniform', 'distance'],
            'model__p': [1, 2]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': randint(2, 15),
            'model__min_samples_leaf': randint(1, 10)
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}  # No hyperparameters for GaussianNB
    },
    'XGBClassifier': {
        'model': XGBClassifier(eval_metric='logloss', use_label_encoder=False),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': randint(3, 10)
        }
    },
    'LGBMClassifier': {
        'model': LGBMClassifier(),
        'params': {
            'model__n_estimators': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__max_depth': [-1, 10, 20]
        }
    },
    'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'model__iterations': randint(50, 200),
            'model__learning_rate': uniform(0.01, 0.2),
            'model__depth': randint(4, 10)
        }
    }
}

# Perform RandomizedSearchCV for each model
best_models = {}
for model_name, model_info in models.items():
    print(f"Tuning {model_name}...")
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Applies scaling
        ('model', model_info['model'])
    ])

    search = RandomizedSearchCV(pipe, model_info['params'], n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)
    search.fit(X_train, y_train)

    best_models[model_name] = search.best_estimator_
    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validation score for {model_name}: {search.best_score_:.4f}\n")

# Evaluate best models on the test set
for model_name, model in best_models.items():
    print(f"Evaluating {model_name} on the test set...")
    y_pred = model.predict(X_test)
    print(f"Classification report for {model_name}:\n{classification_report(y_test, y_pred)}\n")

Column names: Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
           759,     760,     761,     762,     763,     764,     765,     766,
           767, 'Class'],
      dtype='object', length=769)
Tuning LogisticRegression...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for LogisticRegression: {'model__C': 1559.9453033620264, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
Best cross-validation score for LogisticRegression: 0.5216

Tuning RandomForestClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for RandomForestClassifier: {'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 9, 'model__n_estimators': 87}
Best cross-validation score for RandomForestClassifier: 0.5484

Tuning SVC...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for SVC: {'model__C': 20.585494295802448, 'mode



Best parameters for GaussianNB: {}
Best cross-validation score for GaussianNB: 0.4881

Tuning XGBClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBClassifier: {'model__learning_rate': 0.13349630192554332, 'model__max_depth': 4, 'model__n_estimators': 71}
Best cross-validation score for XGBClassifier: 0.5536

Tuning LGBMClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195795
[LightGBM] [Info] Number of data points in the train set: 1344, number of used features: 768
[LightGBM] [Info] Start training from score -1.094158
[LightGBM] [Info] Start training from score -0.913690
[LightGBM] [Info] Start training from score -1.331288
Best parameters for LGBMClassifier: {'model__learning_rate': 0.15639878836228102, 'model__max_depth': -1, 'model__n_estimators': 70}
Best cross-validation score for LGBMClassifier: 0.5670

Tuning CatBoostClassifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for CatBoostClassifier: {'model__depth': 7, 'model__iterations': 142, 'model__learning_rate': 0.04668695797323276}
Best cross-validation score for CatBoostClassifier: 0.5536

Evaluating LogisticRegression on the test set...
Classification report for Lo

In [16]:
pip install openpyxl




In [5]:
pip install chardet




In [2]:
pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [9]:
# Convert the GloVe file to UTF-8 encoding
input_path = "/content/training_with_glove embeddings_split.xlsx"  # Original GloVe file
output_path = "/content/training_with_glove embeddings_split.xlsx"  # Output UTF-8 file

with open(input_path, "r", encoding="latin1") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            outfile.write(line)

print(f"File converted and saved to {output_path}")


File converted and saved to /content/training_with_glove embeddings_split.xlsx


In [2]:
!pip install catboost
!pip install dask[dataframe]


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr