In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [3]:
X_train_full = pd.read_csv('train_tfidf_features.csv')
X_test = pd.read_csv('test_tfidf_features.csv')

In [4]:
X_train=X_train_full.drop(columns=['label'])
test_data_ids = X_test['id']

In [5]:
y_train = pd.read_csv('train_tfidf_features.csv')['label'] 

In [None]:
X_train.head()

In [None]:
X_test.head()

# BEST MODEL

###  **Feature Engineering with TF-IDF** is a crucial step in preparing data for machine learning models.

## N-Grams (Unigrams and Bigrams):

To improve our model's F1 score, we're using TF-IDF (Term Frequency-Inverse Document Frequency) with both unigrams and bigrams. Here's why:

Unigrams (single words) help us capture individual terms that might indicate hate speech, like slurs or offensive words.
Bigrams (two-word combinations) are crucial for context. For example:

"No Muslims" could be a hateful phrase
But "No" and "Muslims" separately might not indicate hate



By using both, we catch nuances that single words miss. For instance, "not bad" has a different meaning than "not" and "bad" separately.
We're limiting features (max_features) to focus on the most relevant terms and phrases. This helps prevent overfitting and reduces processing time.
Parallel processing speeds up our TF-IDF calculations, which is helpful for our large dataset.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import parallel_backend

def load_data(file_path):
    return pd.read_csv(file_path)

def fit_transform_in_parallel(vectorizer, data):
    with parallel_backend('multiprocessing'):
        return vectorizer.fit_transform(data)

def transform_in_parallel(vectorizer, data):
    with parallel_backend('multiprocessing'):
        return vectorizer.transform(data)

def process_data(df, vectorizer, is_train=True):
    if is_train:
        X = fit_transform_in_parallel(vectorizer, df['post'])
    else:
        X = transform_in_parallel(vectorizer, df['post'])
    
    feature_names = vectorizer.get_feature_names_out()
    new_df = pd.DataFrame(X.toarray(), columns=feature_names)
    new_df['id'] = df['id']
    
    if is_train:
        new_df['label'] = df['label']
        cols = ['id', 'label'] + [col for col in new_df.columns if col not in ['id', 'label']]
    else:
        cols = ['id'] + [col for col in new_df.columns if col != 'id']
    
    new_df = new_df[cols]
    return new_df, len(feature_names)

# Read the CSV files
print("Loading data...")
df_train = load_data('./50-007-machine-learning-summer-2024/train.csv')
df_test = load_data('./50-007-machine-learning-summer-2024/test.csv')
print("Data loaded.")

# Create TF-IDF Vectorizer with unigrams, bigrams, and trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features= 70000)  # Adjust max_features as needed

# Process training data
print("Processing training data...")
new_df_train, n_features = process_data(df_train, vectorizer, is_train=True)
print(f"Number of features: {n_features}")

# Save training data to a new Parquet file
new_df_train.to_parquet('new_train_tfidf_features_with_ngrams.parquet', index=False)
print("Training data saved.")

# Process test data
print("Processing test data...")
new_df_test, _ = process_data(df_test, vectorizer, is_train=False)

# Save test data to a new Parquet file
new_df_test.to_parquet('new_test_tfidf_features_with_ngrams.parquet', index=False)
print("Test data saved.")

print("Processing complete.")

# Load the processed data
X_train_full = pd.read_parquet('new_train_tfidf_features_with_ngrams.parquet')
X_test = pd.read_parquet('new_test_tfidf_features_with_ngrams.parquet')

# Extract labels and features for training data
y_train = X_train_full['label']
X_train = X_train_full.drop(columns=['label', 'id'])

# Extract IDs for test data
test_data_ids = X_test['id']
X_test = X_test.drop(columns=['id'])

print("Data prepared for modeling:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")


### Hate Speech Classification with Stacked Models and Dask

We're using a stacked model approach, combining CatBoost, Logistic Regression, 
and Decision Tree classifiers. This ensemble method aims to capture different 
patterns in the data, potentially improving our hate speech detection accuracy.

We're using Dask instead of standard pandas DataFrames due to memory constraints. 
Our feature set is large (about 70,000 features), which exceeds our available RAM. 
Dask allows us to process this data in chunks, enabling us to work with a rich 
feature set that would otherwise be impossible with our limited memory resources.

This approach balances the need for complex models and rich features with the 
practical limitations of our hardware.


In [None]:
import dask.dataframe as dd
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import logging
import time
import pandas as pd

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Load the data using Dask
logging.info("Loading data...")
dask_df_train = dd.read_parquet('new_train_tfidf_features_with_ngrams.parquet')
logging.info("Loading data...2")
dask_df_test = dd.read_parquet('new_test_tfidf_features_with_ngrams.parquet')
logging.info("Done loading...")
# Convert to pandas DataFrame (in chunks)
X_train_full = dask_df_train.compute()
X_test_full = dask_df_test.compute()

y_train = X_train_full['label']
X_train = X_train_full.drop(columns=['label'])

test_data_ids = X_test_full['id'].copy()  # Make a copy of the id column

# Remove 'id' column from both X_train and X_test if present
if 'id' in X_train.columns:
    X_train = X_train.drop(columns=['id'])
X_test = X_test_full.drop(columns=['id'])

# Ensure both X_train and X_test have the same columns
common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]

logging.info(f"Shape of X_train: {X_train.shape}")
logging.info(f"Shape of X_test: {X_test.shape}")

# Define the base models
catboost_model = CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, verbose=0, thread_count=-1, allow_writing_files=True)
log_reg_model = LogisticRegression(solver='liblinear', max_iter=100, n_jobs=-1)

# Define the stacking classifier
stacking_model = StackingClassifier(
    estimators=[('catboost', catboost_model), ('log_reg', log_reg_model)],
    final_estimator=DecisionTreeClassifier(max_depth=5),
    cv=5,
    n_jobs=-1
)

# Train the model
logging.info("Training the stacked model...")
start_time = time.time()
stacking_model.fit(X_train, y_train)
end_time = time.time()
logging.info(f"Model training completed in {end_time - start_time:.2f} seconds")

# Make predictions
logging.info("Making predictions...")
y_pred = stacking_model.predict(X_test)

# Create and save the submission file
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_catboostlrestimator.csv', index=False)
logging.info("Submission file created: submission_catboostlrestimator.csv")

logging.info("Process completed successfully.")


## Other Iterations

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

> Naive Bayes uses conditional probability to classify data

## Gaussian NB
Start with using Gaussian NB for classification where
uses Gaussian features


In [None]:
nb_model=GaussianNB()
nb_model=nb_model.fit(X_train,y_train)

In [None]:
y_pred=nb_model.predict(X_test)

In [None]:
y_pred

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_nb.csv', index=False)

To try more variations, adjust the paramter of var_smoothing to stabilise model with small variance


grid search will be used to optimise hyperparameter of var_smoothing to ensure stability
grid search takes in the set of parameters and using cross-validation, as dataset is large, will use folds of 5 to reduce computational load

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

In [None]:
nb_model=GaussianNB()
grid_search = GridSearchCV(nb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
print(f"Best var_smoothing: {grid_search.best_params_['var_smoothing']}")

same as default value in sci kit, hence no need for changes

### Multinomial Naive Bayes
Multinomial Naive Bayes will classify using multinomial distribution(assumes that features are frequency of occurences -> generalises binomial distribution(probability of outcomes from fixed number of trials)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
mnb_model=MultinomialNB()
mnb_model.fit(X_train, y_train)

In [None]:
y_pred=mnb_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_mnb.csv', index=False)

hyperparameter of multinomial naive bayes is alpha: laplace smoothing to avoid 0 probability which adjusts probability estimates by adding a smoothing parameter

grid search will be used to find the optimal value again

In [None]:
mnb_model=MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]
}
grid_search = GridSearchCV(estimator=mnb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
print(f"best alpha: {grid_search.best_params_['alpha']}")

Retry with new alpha

In [None]:
mnb_model=MultinomialNB(alpha=0.5)
mnb_model.fit(X_train, y_train)

In [None]:
y_pred=mnb_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_mnbw0.5.csv', index=False)

### Complement Naive Bayes

Uses laplace smoothing as well. suitable for imbalanced data sets as it considered the P(complement) of each class, start with using grid search to find optimal alpha

In [None]:
from sklearn.naive_bayes import ComplementNB

In [None]:
cnb_model=ComplementNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]
}
grid_search = GridSearchCV(estimator=cnb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
print(f"best alpha: {grid_search.best_params_['alpha']}")

retry w new alpha

In [None]:
cnb_model=ComplementNB(alpha=0.5)
cnb_model.fit(X_train, y_train)

In [None]:
y_pred=cnb_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_cnbw0.5.csv', index=False)

In [None]:
cnb_model=ComplementNB(alpha=0.45)
cnb_model.fit(X_train, y_train)

In [None]:
y_pred=cnb_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_cnbw0.45.csv', index=False)


cnb with diff alphas

In [None]:
cnb_model=ComplementNB()
param_grid = {
    'alpha': [ 0.65, 0.7, 0.75, 0.8, 0.83, 0.85]
}
grid_search = GridSearchCV(estimator=cnb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
print(f"best alpha: {grid_search.best_params_['alpha']}")

In [None]:
cnb_model=ComplementNB(alpha=0.6)
cnb_model.fit(X_train, y_train)

In [None]:
y_pred=cnb_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_cnbw0.8.csv', index=False)

When comparing with the previous models tested, complement nb is leading to a higher score, this may indicate that there is a class imbalance in the data set, hence new synthetic sampels using SMOTE which is the synthetic minority oversampling technique which will generate new samples by interpolating minority class samples as compared to oversampling which duplicates the existing ones

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
smote = SMOTE()

In [None]:
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
cnb_model=ComplementNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]
}
grid_search = GridSearchCV(estimator=cnb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_resampled, y_resampled)

In [None]:
print(f"best alpha: {grid_search.best_params_['alpha']}")

In [None]:
cnb_model=ComplementNB(alpha=0.1)
cnb_model.fit(X_resampled, y_resampled)

In [None]:
y_pred=cnb_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_cnbwSMOLE.csv', index=False)

## SVM

### SVC
C-support vector classification

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [None]:
scaler = StandardScaler()
svc_model = SVC(kernel='linear', max_iter=10000)
svc_model=svc_model.fit(X_train,y_train)

In [None]:
y_pred=svm_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_svm.csv', index=False)

Parameters of SVC: Kernel: transforms input data to required form
Regularisation: control the margin of hyperplane
Gamma: choosing of points to consider

Optimise with grid search again

In [None]:
svm_model=SVC()
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

In [None]:
random_search = RandomizedSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
random_search.fit(X_train, y_train)
print("best parameters:", grid_search.best_params_)

SVC with optimised parameters

In [None]:
svc_model=SVC(kernel='linear')
svc_model=svc_model.fit(X_train,y_train)
y_pred=svm_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_svm_optimised.csv', index=False)

In [None]:
!conda update --all



In [None]:
conda install --force-reinstall anaconda-project conda-build jupyterlab


### NuSVC
same as svc but has paramter to control number of support vectors(nu)

In [None]:
param_grid = {
    'nu': [0.1, 0.2, 0.5, 0.8, 0.9],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4, 5], 
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
    'coef0': [0.0, 0.1, 1, 10], 
    'shrinking': [True, False],
    'class_weight': [None, 'balanced']
}


In [None]:
nusvc_model = NuSVC()

In [None]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

In [None]:
print("best parameters:", grid_search.best_params_)

## Decision Tree


In [None]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier()
decision_tree_model = decision_tree_model.fit(X_train, y_train)

In [None]:
y_pred=decision_tree_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_dt.csv', index=False)

In [None]:
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2']
}

random_search = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("best parameters:", grid_search.best_params_)

In [None]:
decision_tree_model = DecisionTreeClassifier(max_depth=10)
decision_tree_model = decision_tree_model.fit(X_train, y_train)

In [None]:
y_pred=decision_tree_model.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_dtoptimised.csv', index=False)

In [None]:
!pip install \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu12==24.6.* dask-cudf-cu12==24.6.* cuml-cu12==24.6.* \
    cugraph-cu12==24.6.* cuspatial-cu12==24.6.* cuproj-cu12==24.6.* \
    cuxfilter-cu12==24.6.* cucim-cu12==24.6.* pylibraft-cu12==24.6.* \
    raft-dask-cu12==24.6.* cuvs-cu12==24.6.*

## Logistics Regression

In [21]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

In [None]:
y_pred = log_reg_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_lr.csv', index=False)

Hyperparameters for logistics regression: c: regularisation strength, penalty: l1/l2 penalty where l2 punishes outliers more, solver: algo for optimisation, eg. liblinera, newton-cg, bfgs, sag, saga, max_iter: max number of iterations for convergence, class-weight: adjusts weight for imbalanced data

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.05, 0.1, 0.15, 0.2],
    'penalty': ['l2'],
    'solver': ['liblinear'],
    'max_iter': [100],
    'class_weight': ['balanced']
}

In [None]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f'best parameters: {grid_search.best_params_}')

In [None]:
log_reg_model = LogisticRegression(C=0.2,
    class_weight='balanced',
    max_iter=100,
    penalty='l2',
    solver='liblinear'
)
log_reg_model.fit(X_train, y_train)
y_pred = log_reg_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_lroptimised0.2.csv', index=False)

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.4],
    'penalty': ['l1','l2'],
    'solver': ['liblinear'],
    'max_iter': [75,100,125],
    'class_weight': ['balanced']
}

In [24]:
log_reg_model = LogisticRegression()
random_search = RandomizedSearchCV(
    estimator=log_reg_model,
    param_distributions=param_grid,
    n_iter=10,  # Number of different combinations to try
    cv=5,       # Number of cross-validation folds
    verbose=1,  # Level of verbosity
    random_state=42,  # For reproducibility
    n_jobs=-1  # Use all available cores
)
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 75, 'class_weight': 'balanced', 'C': 0.4}


In [27]:
log_reg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    max_iter=75,
    class_weight='balanced',
    C=0.4
)
log_reg_model.fit(X_train, y_train)
y_pred = log_reg_model.predict(X_test)

In [28]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_lroptimised0.4_2.csv', index=False)

## Stochastic Gradient Descent Classification

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
sgd_model=SGDClassifier()
sgd_model.fit(X_train,y_train)
y_pred = sgd_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_sgd.csv', index=False)

In [None]:
param_grid = {
    'loss': ['log', 'modified_huber'],
    'alpha': [0.0001, 0.001],
    'penalty': ['l2', 'l1', 'elasticnet']
}

In [None]:
sgd = SGDClassifier()
random_search = RandomizedSearchCV(estimator=sgd, param_distributions=param_grid, n_iter=10, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

In [None]:
print(f'best parameters: {grid_search.best_params_}')

In [None]:
sgd_model=SGDClassifier()
sgd_model.fit(X_train,y_train)
y_pred = sgd_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_sgdoptimised.csv', index=False)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


In [None]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

In [None]:
y_pred = random_forest_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_rf.csv', index=False)

In [None]:
from scipy.stats import randint


param_grid = {
    'n_estimators': [50, 100, 200, 300],          
    'max_features': ['auto', 'sqrt', 'log2'],   
    'max_depth': [None, 10, 20, 30, 40, 50],   
    'min_samples_split': [2, 5, 10],           
    'min_samples_leaf': [1, 2, 4],              
    'bootstrap': [False]                  
}



In [None]:
random_forest_model = RandomForestClassifier()
random_forest_random = RandomizedSearchCV(estimator = random_forest_model, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
random_forest_random.fit(X_train, y_train)
print("best parameters:", random_search.best_params_)

In [None]:
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_rfotimised.csv', index=False)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred= gbc.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_gb.csv', index=False)

LightGBM

In [None]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

In [None]:
y_pred=clf.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_lightgb.csv', index=False)

optimise by:
inc max bin
slower learning rate
inc number of iterations
inc number of leaves(but may lead to overfitting)
DART

In [None]:
params = {
    'boosting_type': 'dart',  # Using DART
    'learning_rate': 0.01,    # Small learning rate
    'num_iterations': 1000,   # Large number of iterations
    'num_leaves': 63,         # Large number of leaves
    'max_bin': 255}          # Large max_bin

In [None]:
train_data = lgb.Dataset(X_train, label=y_train)

In [None]:
model = lgb.train(params, train_data)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_lightgboptimised.csv', index=False)

### XGboost

In [None]:
from sklearn.utils import resample
from xgboost import XGBClassifier

In [None]:
model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)
predictions_proba = model.predict_proba(X_test)[:, 1]
predictions_binary = (predictions_proba > 0.5).astype(int)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': final_predictions_binary})
submission.to_csv('submission_xgboost.csv', index=False)

In [None]:
n_bags = 5
bagged_predictions = np.zeros((X_test.shape[0], n_bags))

In [None]:
for i in range(n_bags):
    X_resampled, y_resampled = resample(X_train, y_train, random_state=i)
    
    model = XGBClassifier(eval_metric='mlogloss')
    
    model.fit(X_resampled, y_resampled)
    
    bagged_predictions[:, i] = model.predict_proba(X_test)[:, 1]

In [None]:
final_predictions = np.mean(bagged_predictions, axis=1)
final_predictions_binary = (final_predictions > 0.5).astype(int)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': final_predictions_binary})
submission.to_csv('submission_xgboostbagged.csv', index=False)

In [None]:
xgb_model = XGBClassifier(eval_metric='mlogloss')
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6],
    'min_child_weight': [1, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'lambda': [1, 5],
    'alpha': [0, 0.1]
}

In [None]:
random_search = RandomizedSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
print("best parameters:", random_search.best_params_)

In [None]:
xgb_model = XGBClassifier(
    subsample=1.0,
    n_estimators=200,
    min_child_weight=5,
    max_depth=6,
    learning_rate=0.1,
    reg_lambda=5,                 
    gamma=0.1,
    colsample_bytree=0.8,
    alpha=0.1,
    eval_metric='mlogloss'     
)

xgb_model.fit(X_train,y_train)
y_pred=xgb_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_xgboostoptimised.csv', index=False)

### Cat Boost

In [6]:
from catboost import CatBoostClassifier

In [None]:
catboost=CatBoostClassifier()
catboost.fit(X_train,y_train)
y_pred=catboost.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': final_predictions_binary})
submission.to_csv('submission_catboost.csv', index=False)

In [7]:
# Reduced parameter grid
param_grid = {
    'iterations': [500, 750],  # Fewer iterations
    'learning_rate': [0.01, 0.1],  # Fewer learning rates
    'depth': [6, 7],  # Fewer depths
    'l2_leaf_reg': [3, 7],  # Fewer regularization values
    'subsample': [0.8, 1.0],  # Fewer subsample values
    'bagging_temperature': [0, 1.0],  # Fewer values for bagging temperature
    'border_count': [50, 100]  # Fewer border counts
}


In [None]:
catboost=CatBoostClassifier()
random_search = RandomizedSearchCV(
    catboost,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    verbose=1,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
print("best parameters:", random_search.best_params_)

In [8]:
# Reduced parameter grid
param_grid = {
    'iterations': [750,1000],  # Fewer iterations
    'learning_rate': [0.07, 0.1,1.3],  # Fewer learning rates
    'depth': [7,8],  # Fewer depths
    'l2_leaf_reg': [6, 7,8],  # Fewer regularization values
    'subsample': [0.5,0.8],  # Fewer subsample values
    'bagging_temperature': [0, 0.5],  # Fewer values for bagging temperature
    'border_count': [25, 50, 75]  # Fewer border counts
}


In [9]:
catboost=CatBoostClassifier()
random_search = RandomizedSearchCV(
    catboost,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    verbose=1,
    n_jobs=-1
)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
0:	learn: 0.6856162	total: 212ms	remaining: 3m 31s
1:	learn: 0.6777202	total: 283ms	remaining: 2m 21s
2:	learn: 0.6721506	total: 347ms	remaining: 1m 55s
3:	learn: 0.6674339	total: 410ms	remaining: 1m 42s
4:	learn: 0.6621623	total: 471ms	remaining: 1m 33s
5:	learn: 0.6579746	total: 533ms	remaining: 1m 28s
6:	learn: 0.6538511	total: 604ms	remaining: 1m 25s
7:	learn: 0.6500967	total: 665ms	remaining: 1m 22s
8:	learn: 0.6475778	total: 729ms	remaining: 1m 20s
9:	learn: 0.6449219	total: 790ms	remaining: 1m 18s
10:	learn: 0.6423591	total: 850ms	remaining: 1m 16s
11:	learn: 0.6394914	total: 907ms	remaining: 1m 14s
12:	learn: 0.6377549	total: 965ms	remaining: 1m 13s
13:	learn: 0.6360028	total: 1.02s	remaining: 1m 12s
14:	learn: 0.6340507	total: 1.08s	remaining: 1m 11s
15:	learn: 0.6318020	total: 1.15s	remaining: 1m 10s
16:	learn: 0.6303070	total: 1.21s	remaining: 1m 9s
17:	learn: 0.6282124	total: 1.27s	remaining: 1m 9s
18:	learn: 0.6

In [10]:
print("best parameters:", random_search.best_params_)

best parameters: {'subsample': 0.8, 'learning_rate': 0.07, 'l2_leaf_reg': 7, 'iterations': 1000, 'depth': 7, 'border_count': 25, 'bagging_temperature': 0.5}


In [45]:
catboost_model = CatBoostClassifier(
    iterations=1000, 
    learning_rate=0.1, 
    depth=7, 
    l2_leaf_reg=7, 
    border_count=50, 
    bagging_temperature=1.0, 
    subsample=1.0, 
    verbose=0
)

In [46]:
catboost_model.fit(X_train,y_train)
y_pred=catboost_model.predict(X_test)

In [47]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_catboost0.1_2.csv', index=False)

In [33]:
catboost_model = CatBoostClassifier(
    iterations=750, 
    learning_rate=0.1, 
    depth=7, 
    l2_leaf_reg=3, 
    border_count=50, 
    bagging_temperature=0, 
    subsample=1.0, 
    verbose=0
)


In [34]:
catboost.fit(X_train,y_train)
y_pred=catboost.predict(X_test)

Learning rate set to 0.0347
0:	learn: 0.6897648	total: 50.3ms	remaining: 50.3s
1:	learn: 0.6858047	total: 97.4ms	remaining: 48.6s
2:	learn: 0.6824893	total: 141ms	remaining: 46.9s
3:	learn: 0.6795999	total: 186ms	remaining: 46.3s
4:	learn: 0.6764532	total: 231ms	remaining: 46s
5:	learn: 0.6733183	total: 275ms	remaining: 45.6s
6:	learn: 0.6710775	total: 320ms	remaining: 45.4s
7:	learn: 0.6684965	total: 364ms	remaining: 45.2s
8:	learn: 0.6657625	total: 408ms	remaining: 45s
9:	learn: 0.6632184	total: 458ms	remaining: 45.4s
10:	learn: 0.6612863	total: 504ms	remaining: 45.3s
11:	learn: 0.6591142	total: 551ms	remaining: 45.4s
12:	learn: 0.6568444	total: 597ms	remaining: 45.3s
13:	learn: 0.6550989	total: 645ms	remaining: 45.4s
14:	learn: 0.6531684	total: 692ms	remaining: 45.5s
15:	learn: 0.6514039	total: 738ms	remaining: 45.4s
16:	learn: 0.6501234	total: 792ms	remaining: 45.8s
17:	learn: 0.6486257	total: 836ms	remaining: 45.6s
18:	learn: 0.6470534	total: 881ms	remaining: 45.5s
19:	learn: 0.64

In [35]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_catboost0.15_5.csv', index=False)

## Cat and LGB

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model=KNeighborsClassifier()
knn_model.fit(X_train, y_train)

In [None]:
y_pred=knn_model.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_knn.csv', index=False)

In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40]
}

In [None]:
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
print(f'best parameters: {grid_search.best_params_}')

In [None]:
knn_model=KNeighborsClassifier()
knn_model.fit(X_train, y_train)

## Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
ridge_model= RidgeClassifier()
ridge_model.fit(X_train, y_train)

In [None]:
y_pred=ridge_model.predict(X_test)

In [None]:
param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0],
}

In [None]:
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
print(f'best parameters: {grid_search.best_params_}')

In [None]:
ridge_model= RidgeClassifier()
ridge_model.fit(X_train, y_train)

## AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adaboost=AdaBoostClassifier()
adaboost.fit(X_train, y_train)

In [None]:
y_pred=adaboost.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_adaboost.csv', index=False)

In [None]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1],
    'estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]
}

In [None]:
adaboost=AdaBoostClassifier()
random_search = RandomizedSearchCV(estimator=adaboost, param_distributions=param_grid, n_iter=20, cv=5, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

In [None]:
print(f'best parameters: {grid_search.best_params_}')

## Extra Tree Classifier

In [None]:
from sklearn.tree import ExtraTreeClassifier

In [None]:
ExtraTreeClassifier = ExtraTreeClassifier()
ExtraTreeClassifier.fit(X_train, y_train)

In [None]:
y_pred=ExtraTreeClassifier.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_exttree.csv', index=False)

## Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
extra_trees = ExtraTreesClassifier(n_estimators=10, random_state=42)
extra_trees.fit(X_train, y_train)

In [None]:
y_pred=extra_trees.predict(X_test)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_exttrees.csv', index=False)

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

In [None]:
ExtraTreeClassifier = ExtraTreesClassifier()
grid_search = GridSearchCV(estimator=ExtraTreeClassifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
print(f'best parameters: {grid_search.best_params_}')

In [None]:
ExtraTreeClassifier = ExtraTreesClassifier()
ExtraTreeClassifier.fit(X_train, y_train)
y_pred=ExtraTreeClassifier.predict(X_test)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_exttreesoptimised.csv', index=False)

### Stacking


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [None]:
def get_models():
 models = dict()
 models['rf'] = RandomForestClassifier()
 models['cnb'] = ComplementNB(alpha=0.5)
 models['lr'] = LogisticRegression(C=0.1,
    class_weight='balanced',
    max_iter=100,
    penalty='l2',
    solver='liblinear'
)
 models['lightgb'] = lgb.LGBMClassifier()
 return models

In [None]:
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from matplotlib import pyplot

In [None]:
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.title('Model Performance Comparison')
pyplot.ylabel('Accuracy')
pyplot.show()

rf, lightgb and lr does better on avg

In [None]:
def get_stacking():
 # define the base models
 level0 = list()
 level0.append(('rf', RandomForestClassifier()))
 level0.append(('cnb', ComplementNB(alpha=0.5)))
 level0.append(('lr', LogisticRegression(C=0.1,
    class_weight='balanced',
    max_iter=100,
    penalty='l2',
    solver='liblinear')))
 level0.append(('lightgb', lgb.LGBMClassifier()))
 # define meta learner model
 level1 = LogisticRegression()
 # define the stacking ensemble
 model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
 return model

In [None]:
def get_models():
 models = dict()
 models['rf'] = RandomForestClassifier()
 models['cnb'] = ComplementNB(alpha=0.5)
 models['lr'] = LogisticRegression(C=0.1,
    class_weight='balanced',
    max_iter=100,
    penalty='l2',
    solver='liblinear'
)
 models['lightgb'] = lgb.LGBMClassifier()
 models['stacking']=get_stacking()
 return models

In [None]:
def evaluate_model(model, X, y):
 cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
 scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
 return scores

In [None]:
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.title('Model Performance Comparison')
pyplot.ylabel('Accuracy')
pyplot.show()

## NB-SVM


In [None]:
from sklearn.naive_bayes import ComplementNB


In [None]:
cnb = ComplementNB(alpha=0.5)
cnb.fit(X_train, y_train)
prob_train = cnb.predict_proba(X_train)
prob_test = cnb.predict_proba(X_test)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
prob_train_scaled = scaler.fit_transform(prob_train)
prob_test_scaled = scaler.transform(prob_test)

In [None]:
from sklearn.svm import SVC

svm = SVC(kernel='linear') 
svm.fit(prob_train_scaled, y_train)

In [None]:
from sklearn.metrics import accuracy_score
y_pred = svm.predict(prob_test_scaled)

In [None]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred})
submission.to_csv('submission_nb-svm.csv', index=False)

## LR + cnb

## lr +cnb +catboost

In [None]:

log_reg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    max_iter=75,
    class_weight='balanced',
    C=0.4
)

catboost_model = CatBoostClassifier(
    iterations=750, 
    learning_rate=0.1, 
    depth=7, 
    l2_leaf_reg=3, 
    border_count=50, 
    bagging_temperature=0, 
    subsample=1.0, 
    verbose=0
)
cnb_model=ComplementNB(alpha=0.5)


In [52]:
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Define the base models
log_reg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    max_iter=75,
    class_weight='balanced',
    C=0.4
)

catboost_model = CatBoostClassifier(
    iterations=750, 
    learning_rate=0.1, 
    depth=7, 
    l2_leaf_reg=3, 
    border_count=50, 
    bagging_temperature=0, 
    subsample=1.0, 
    verbose=0
)

cnb_model = ComplementNB(alpha=0.5)

# Define the stacking classifiers
stacking_log_reg = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('log_reg', log_reg_model),
        ('cnb', cnb_model)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

stacking_decision_tree = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('log_reg', log_reg_model),
        ('cnb', cnb_model)
    ],
    final_estimator=DecisionTreeClassifier(max_depth=5),
    cv=5
)

stacking_gbm = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('log_reg', log_reg_model),
        ('cnb', cnb_model)
    ],
    final_estimator=GradientBoostingClassifier(n_estimators=100),
    cv=5
)

# Train and evaluate models
stacking_log_reg.fit(X_train, y_train)
stacking_decision_tree.fit(X_train, y_train)
stacking_gbm.fit(X_train, y_train)

y_pred_log_reg = stacking_log_reg.predict(X_test)
y_pred_decision_tree = stacking_decision_tree.predict(X_test)
y_pred_gbm = stacking_gbm.predict(X_test)

In [53]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred_gbm})
submission.to_csv('submission_lr-cnb-catboost_gradboostclassifier.csv', index=False)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred_log_reg})
submission.to_csv('submission_lr-cnb-catboost_lrestimator.csv', index=False)
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred_decision_tree})
submission.to_csv('submission_lr-cnb-catboostdtestimator.csv', index=False)

## catboost + lr gradboost estimator

In [54]:
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Define the base models
log_reg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    max_iter=75,
    class_weight='balanced',
    C=0.4
)

catboost_model = CatBoostClassifier(
    iterations=750, 
    learning_rate=0.1, 
    depth=7, 
    l2_leaf_reg=3, 
    border_count=50, 
    bagging_temperature=0, 
    subsample=1.0, 
    verbose=0
)

In [55]:
# Define the stacking classifiers
stacking_log_reg = StackingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('log_reg', log_reg_model),
        ('cnb', cnb_model)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

In [56]:
submission = pd.DataFrame({'Id': test_data_ids, 'Label': y_pred_gbm})
submission.to_csv('submission_lr-catboost_gradboostclassifier.csv', index=False)