# **Model Building**

## **1.0 Import Libraries**

In [1]:
# For Data Manipulation
import pandas as pd
import numpy as np
import pickle
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# For Grid Search
from sklearn.model_selection import GridSearchCV

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## **2.0 Load Data**

In [2]:
# Feature frames: missing cells are replaced with empty strings
# (presumably tweets that ended up empty after cleaning — TODO confirm).
x_train, x_test = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in ('../Artifacts/x_train.csv', '../Artifacts/x_test.csv')
)
# Label frames are read as-is.
y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Artifacts/y_train.csv', '../Artifacts/y_test.csv')
)

In [3]:
# Load the previously pickled vectorizer. The context manager guarantees the
# file handle is closed (the bare open() call used before never closed it).
# NOTE(security): pickle.load can execute arbitrary code — only load
# artifacts that come from a trusted source.
with open('../Artifacts/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Transform the 'Tweet' text column into feature matrices. The results are
# deliberately left sparse (no .toarray()).
# NOTE: x_train / x_test are rebound from DataFrames to matrices here, so the
# data-loading cell must be re-run before this cell can be executed again.
x_train = vectorizer.transform(x_train['Tweet'])
x_test = vectorizer.transform(x_test['Tweet'])

print(x_train.shape, x_test.shape)

(1266952, 333132) (316739, 333132)


In [4]:
# Convert the label frames to flat 1-D arrays. np.array() on a one-column
# DataFrame yields shape (n_samples, 1); scikit-learn expects y with shape
# (n_samples,) and otherwise emits a DataConversionWarning (hidden here by the
# global warnings filter). Assumes each label CSV holds exactly one column —
# TODO confirm.
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()

In [5]:
# Peek at the first five rows of the vectorized training matrix.
x_train[:5]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 42 stored elements and shape (5, 333132)>

## **4.0 Model Building**

In [6]:
# Fixed seed so the search is repeatable: scikit-learn documents that
# LogisticRegression uses random_state to shuffle the data with liblinear.
RANDOM_STATE = 42

# Candidate estimators, each paired with the hyper-parameter grid to search.
models = {
    'Logistic Regression': (
        LogisticRegression(random_state=RANDOM_STATE),
        {"C": [0.01, 0.1, 1, 10, 100], "penalty": ["l1", "l2"], "solver": ["liblinear"]},
    ),
    'Multinomial Naive Bayes': (
        MultinomialNB(),
        {"alpha": [0.01, 0.1, 1, 10], "fit_prior": [True, False]},
    ),
}

# Selection state. Initialised up front so the names exist even if no
# candidate is ever selected.
best_cv_score = float("-inf")  # best mean cross-validation accuracy so far
best_score = 0                 # test accuracy of the currently selected model
Best_model_name = None
Best_model = None
Best_Params = None

for name, (model, params) in models.items():
    # 5-fold grid search on the training data; with the default refit=True the
    # best configuration is refitted on the full training set.
    search = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
    search.fit(x_train, y_train)

    # Held-out accuracy is reported for information only.
    y_pred = search.best_estimator_.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print(f"{name}")
    print(f"Validation Score: {search.best_score_}")
    print(f"accuracy score: {score}")
    print("-----------------")

    # Pick the winner by cross-validation score, not by test accuracy:
    # choosing on the test set would make the reported test accuracy an
    # optimistically biased estimate for the selected model.
    if search.best_score_ > best_cv_score:
        best_cv_score = search.best_score_
        best_score = score
        Best_model_name = name
        Best_model = search.best_estimator_
        Best_Params = search.best_params_

if Best_model is None:
    # Reached only if no candidate produced a comparable (non-NaN) CV score.
    raise RuntimeError("No model produced a valid cross-validation score")

print(f"Best Model: {Best_model_name}")
print(f"Best Params: {Best_Params}")

Logistic Regression
Validation Score: 0.7775014367294946
accuracy score: 0.7782274996132462
-----------------
Multinomial Naive Bayes
Validation Score: 0.7620201861379222
accuracy score: 0.7620185704949501
-----------------
Best Model: Logistic Regression
Best Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}


In [7]:
# Persist the selected estimator as a pickle file named after the model.
artifact_path = f"../Artifacts/{Best_model_name}.pkl"
with open(artifact_path, "wb") as out_file:
    pickle.dump(Best_model, out_file)

print(f"Best model saved as {Best_model_name}.pkl")

Best model saved as Logistic Regression.pkl
