In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# Importing all required libraries
import pandas as pd
import numpy as np
import pickle
from typing import List
import random
import io
from statistics import mean

from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV,
)
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
)
from sklearn.pipeline import Pipeline

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Fetching stopwords
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:

# Reading in data
# df = pd.read_csv(io.StringIO(uploaded["CLEANED_FOR_REAL.csv"].decode("latin-1")))
# df = pd.read_csv("/content/CLEANED_FOR_REAL.csv", encoding="latin-1")
df = pd.read_csv("/content/Synthetic_from_NB.csv", encoding="latin-1")
df.columns=['label','text']
df.head()


Unnamed: 0,label,text
0,1,seven hours one get julyyyyy exitedd breast la...
1,1,higher mine christopher ugghhh fishing otherwi...
2,1,havegood night dock meetings scores slid canwa...
3,1,notts hadheadache days today hope whoop whoop ...
4,0,test thank wheres herecome sometime soon world...


In [42]:

# Checking attributes in the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   int64 
 1   text    2000 non-null   object
dtypes: int64(1), object(1)
memory usage: 31.4+ KB


In [43]:


# Drop nulls as they are only 0.4%
df.dropna(axis=0, inplace=True)

# df["text"] = df["text"].str.lower()
x = df["text"]
y = df["label"]

# Splitting into 75-25 train-test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.0001, random_state=42
)

token = RegexpTokenizer(r"[a-zA-Z0-9]+")

# Encoding as unicode as there are some special chars in tweets
x_train = x_train.values.astype("U")

# Building the pipeline for NB
classifier = Pipeline(
    [
        ("tfidf", TfidfVectorizer(tokenizer=token.tokenize)),
        ("clf", MultinomialNB()),
    ]
)

tuning_parameters = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "tfidf__smooth_idf": (True, False),
    "tfidf__norm": ("l1", "l2"),
    "tfidf__max_features": (10000, 25000),
    "clf__alpha": [0.1, 0.5, 1, 1.5, 2],
}

In [44]:
"""Using Optimal model from Grid Search"""
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    use_idf=False,
    smooth_idf=True,
    max_features=25000,
    norm="l1",
    tokenizer=token.tokenize,
)
x_train_tfidf = vec.fit_transform(x_train)
clf = MultinomialNB(alpha=0.5).fit(x_train_tfidf, y_train)

x_test_tfidf = vec.transform(x_test)
predictions = clf.predict(x_test_tfidf)
predictions_prob = clf.predict_log_proba(x_test_tfidf)

k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
x_tfidf = vec.fit_transform(x.values.astype("U"))
print(
    "Cross Validation score:",
    cross_val_score(clf, x_tfidf, y, cv=k_fold, n_jobs=-1, error_score="raise"),
)

'Using Optimal model from Grid Search'

Cross Validation score: [0.59  0.52  0.655 0.6   0.67  0.63  0.575 0.635 0.555 0.605]


In [45]:


print("Train accuracy ={:.2f}%".format(clf.score(x_train_tfidf, y_train) * 100))
print("Test accuracy ={:.2f}%".format(clf.score(x_test_tfidf, y_test) * 100))

print("Precision score: ", precision_score(y_test, predictions, average="weighted"))
print("Recall score: ", recall_score(y_test, predictions, average="weighted"))

print(classification_report(y_test, predictions, digits=4))

Train accuracy =92.75%
Test accuracy =100.00%
Precision score:  1.0
Recall score:  1.0
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         1

    accuracy                         1.0000         1
   macro avg     1.0000    1.0000    1.0000         1
weighted avg     1.0000    1.0000    1.0000         1

