# Classification
Classification models help us to segregate data into seprate classes to find out the for example if a email is spam or not

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import pandas as pd
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
    HashingVectorizer,
)
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [26]:
dataset = pd.read_csv("./IMDB Dataset.csv")
X = dataset["review"].values
y = dataset["sentiment"].values

In [27]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print(y[:5])

[1 1 1 0 1]


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(f"X-train {len(X_train)}")
print(f"X-test {len(X_test)}")

X-train 40000
X-test 10000


In [56]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 3), min_df=2)
# vectorizer = HashingVectorizer()
# vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [57]:
def eval_res(true, pred):
    cm = confusion_matrix(true, pred)
    print(cm)
    print(f"Accuracy = {accuracy_score(true, pred)}")
    print(f"Precision = {precision_score(true, pred)}")
    print(f"Recall = {recall_score(true, pred)}")
    print(f"F1 = {f1_score(true, pred)}")

## Multinomial Naive Bayes
It deal with the frequency of words for our predictions.\
It assumes that the features are independent to one another

In [58]:
start = datetime.now()
model = MNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Total time = {datetime.now() - start}")
eval_res(y_test, pred)

Total time = 0:00:00.076304
[[4431  596]
 [ 497 4476]]
Accuracy = 0.8907
Precision = 0.8824921135646687
Recall = 0.9000603257590991
F1 = 0.8911896465903435


## Support Vector Machine/Classifier(SVC)
SVC use the concept of support vectors to position the seprator between classes. It also has a margin - the distance from the boundary to the closent point to maximise class seperation

In [59]:
start = datetime.now()
model = LinearSVC(C=0.5)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Total time = {datetime.now() - start}")
eval_res(y_test, pred)

Total time = 0:00:00.477187
[[4507  520]
 [ 388 4585]]
Accuracy = 0.9092
Precision = 0.8981390793339863
Recall = 0.9219786848984516
F1 = 0.9099027584838262
