## AML Assignment 3
### Varun Agrawal
### MDS202251
#### train.ipynb

Imports and Data Loading

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

def load_data(train_path, val_path, test_path, mod_df_path):
    """Read the four CSV files used by the notebook into DataFrames.

    Parameters
    ----------
    train_path, val_path, test_path, mod_df_path :
        Locations of the CSV files, passed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(train_df, val_df, test_df, df)`` in the same order as the
        arguments; ``df`` is the frame read from ``mod_df_path``.
    """
    csv_locations = (train_path, val_path, test_path, mod_df_path)
    # The generator is consumed left to right, so the files are read in argument order.
    train_df, val_df, test_df, df = (pd.read_csv(location) for location in csv_locations)
    return train_df, val_df, test_df, df


Data Preprocessing

In [11]:
def preprocess_data(train_df, val_df, test_df):
    """Turn the three splits into TF-IDF feature matrices.

    A ``CountVectorizer`` and a ``TfidfTransformer`` are both fitted on the
    training split only and then applied to every split, so the validation
    and test data never influence the vocabulary or the IDF weights.

    Parameters
    ----------
    train_df, val_df, test_df :
        Frames exposing the columns ``X_train``, ``X_val`` and ``X_test``
        respectively (presumably raw text documents - confirm against the
        CSV files).

    Returns
    -------
    tuple
        ``(tfidf_X_train, tfidf_X_val, tfidf_X_test)``.

    Notes
    -----
    The fitted vectorizer and transformer are local to this function and are
    not returned, so they cannot be reused on new data after the call.
    """
    train_text = train_df.X_train

    # Token counts: vocabulary learned from the training documents only.
    count_vectorizer = CountVectorizer().fit(train_text)
    train_counts, val_counts, test_counts = (
        count_vectorizer.transform(documents)
        for documents in (train_text, val_df.X_val, test_df.X_test)
    )

    # TF-IDF re-weighting: IDF statistics learned from the training counts only.
    tfidf_weighter = TfidfTransformer().fit(train_counts)
    return tuple(
        tfidf_weighter.transform(counts)
        for counts in (train_counts, val_counts, test_counts)
    )


Model Training and Hyperparameter Tuning

In [12]:
def train_model(model, X_train, y_train, hyperparams=None):
    """Fit ``model``, optionally wrapped in a 5-fold accuracy grid search.

    Parameters
    ----------
    model :
        Estimator exposing ``fit(X, y)``.
    X_train, y_train :
        Training features and labels, forwarded to ``fit``.
    hyperparams : optional
        Parameter grid for ``GridSearchCV``. When it is ``None`` or empty
        the estimator is fitted directly without any search.

    Returns
    -------
    The fitted object: ``model`` itself when no grid was given, otherwise
    the fitted ``GridSearchCV`` instance built around it.
    """
    # Guard clause: a falsy grid means there is nothing to search over.
    if not hyperparams:
        model.fit(X_train, y_train)
        return model

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=hyperparams,
        scoring='accuracy',
        cv=5,
        return_train_score=True,
    )
    grid_search.fit(X_train, y_train)
    return grid_search


Model Evaluation

In [13]:
def evaluate_model(model, X_val, y_val):
    """Predict on ``X_val`` and score the predictions against ``y_val``.

    Parameters
    ----------
    model :
        Fitted object exposing ``predict``.
    X_val, y_val :
        Features to predict on and the true labels to compare with.

    Returns
    -------
    tuple
        ``(accuracy, report, confusion)`` as produced by ``accuracy_score``,
        ``classification_report`` and ``confusion_matrix``.
    """
    predicted_labels = model.predict(X_val)
    # Every metric takes (true labels, predicted labels) in that order.
    return (
        accuracy_score(y_val, predicted_labels),
        classification_report(y_val, predicted_labels),
        confusion_matrix(y_val, predicted_labels),
    )


Model Testing

In [14]:
def test_model(model, X_test, y_test):
    """Score a fitted model on the held-out test split.

    The metrics are identical to those computed for the validation split, so
    this delegates to ``evaluate_model`` instead of duplicating its body;
    the two can then never drift apart.

    Parameters
    ----------
    model :
        Fitted object exposing ``predict``.
    X_test, y_test :
        Test features and the true test labels.

    Returns
    -------
    tuple
        ``(accuracy, report, confusion)`` - same shape as ``evaluate_model``.
    """
    return evaluate_model(model, X_test, y_test)


Loading Data

In [15]:
# Single place to repoint the notebook at another copy of the data set;
# the four file paths below are all derived from it.
DATA_DIR = '/mnt/d/Applied ML/Assignment 3/data'

train_path = DATA_DIR + '/train.csv'
val_path = DATA_DIR + '/validation.csv'
test_path = DATA_DIR + '/test.csv'
mod_df_path = DATA_DIR + '/modified_df.csv'

# load_data also reads modified_df.csv, but that frame is not used in this
# notebook, so it is discarded here.
train_df, val_df, test_df, _ = load_data(train_path, val_path, test_path, mod_df_path)

# TF-IDF features for every split (fitted on the training split only).
tfidf_X_train, tfidf_X_val, tfidf_X_test = preprocess_data(train_df, val_df, test_df)

Model Initialization and Hyperparameter Definitions

In [16]:
# Grid of candidate values for LogisticRegression's C parameter
# (the inverse of the regularisation strength in scikit-learn).
lr_hyperparams = dict(C=[0.01, 0.1, 1.0, 10.0])

# Base estimator with library defaults; the grid above is applied during training.
lr_model = LogisticRegression()


Model Training

In [17]:
# train_model returns a GridSearchCV wrapper and this cell rebinds lr_model to
# it. Unwrap the base estimator first so that re-running the cell does not
# nest one grid search inside another (that would fail, because 'C' is not a
# parameter of GridSearchCV itself). On the first run lr_model has no
# `estimator` attribute and is used as-is.
lr_base_estimator = getattr(lr_model, 'estimator', lr_model)
lr_model = train_model(lr_base_estimator, tfidf_X_train, train_df.y_train, lr_hyperparams)

Model Evaluation on Validation Set

In [18]:
# lr_model is the fitted grid search returned by train_model; the metrics
# below are computed on the validation split.
lr_accuracy, lr_report, lr_confusion = evaluate_model(
    model=lr_model,
    X_val=tfidf_X_val,
    y_val=val_df.y_val,
)

Model Scoring on Test Set

In [19]:
# Score the best estimator found by the grid search on the test split.
lr_test_accuracy, lr_test_report, lr_test_confusion = test_model(
    model=lr_model.best_estimator_,
    X_test=tfidf_X_test,
    y_test=test_df.y_test,
)

Printing Results

In [20]:
# Best hyperparameters per model, keyed by the label printed in front of them.
best_params_by_model = {"Logistic Regression": lr_model.best_params_}

print("Best Hyperparameters:")
for model_label, best_params in best_params_by_model.items():
    print(model_label + ":", best_params)

Best Hyperparameters:
Logistic Regression: {'C': 10.0}


In [21]:
print("\nLogistic Regression Model:")
for metric_label, metric_value in (
    ("Validation Accuracy:", lr_accuracy),
    ("Test Accuracy:", lr_test_accuracy),
):
    print(metric_label, metric_value)

# NOTE(review): the report and confusion matrix printed here are the
# VALIDATION ones (lr_report / lr_confusion). The test-set counterparts
# (lr_test_report / lr_test_confusion) are computed but never displayed.
print("Classification Report:", lr_report, sep="\n")
print("Confusion Matrix:", lr_confusion, sep="\n")


Logistic Regression Model:
Validation Accuracy: 0.9922480620155039
Test Accuracy: 0.9895287958115183
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       394
           1       1.00      0.97      0.98       122

    accuracy                           0.99       516
   macro avg       0.99      0.98      0.99       516
weighted avg       0.99      0.99      0.99       516

Confusion Matrix:
[[394   0]
 [  4 118]]


**---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

In [24]:
# Persist the best estimator found by the grid search.
# NOTE(review): only the classifier is saved; the CountVectorizer and
# TfidfTransformer fitted inside preprocess_data are not persisted, so this
# file alone cannot score raw text.
joblib.dump(value=lr_model.best_estimator_, filename='Best_LR.pkl')

['Best_LR.pkl']