<a href="https://colab.research.google.com/github/yudumpacin/NLPStudyNotes/blob/main/TextClassification_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1) Data Preprocessing


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
amazon = pd.read_csv("amazon_cells_labelled.txt", sep="\t", names=["review","label"])
yelp = pd.read_csv("yelp_labelled.txt", sep="\t", names=["review","label"])
imdb = pd.read_csv("imdb_labelled.txt", sep="\t", names=["review","label"])

In [19]:
temp = pd.concat([amazon,yelp])

In [20]:
data = pd.concat([temp,imdb])

In [21]:
data.head()

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [22]:
data.shape

(2748, 2)

In [57]:
data["label"].value_counts()

1    1386
0    1362
Name: label, dtype: int64

#2) Data Cleaning and Tokenization

In [23]:
import spacy

In [24]:
nlp = spacy.load("en_core_web_sm")

In [41]:
review = data.iloc[3,0]
review

'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!'

In [42]:
doc = nlp(review)

In [44]:
for token in doc:
  print(token, token.lemma_.lower(), token.pos_)

Tied tie VERB
to to ADP
charger charger NOUN
for for ADP
conversations conversation NOUN
lasting last VERB
more more ADJ
than than ADP
45 45 NUM
minutes minute NOUN
. . PUNCT
MAJOR major PROPN
PROBLEMS problems PROPN
! ! PUNCT
! ! PUNCT


In [46]:
from spacy.lang.en.stop_words import STOP_WORDS

In [47]:
stopwords = list(STOP_WORDS)

In [49]:
"not" in stopwords

True

In [50]:
"no" in stopwords

True

In [55]:
stopwords.remove("not")

In [56]:
stopwords.remove("no")

In [71]:
[token.lemma_.lower() for token in doc if token.lemma_.lower() not in stopwords and not(token.is_punct)]

['tie', 'charger', 'conversation', '45', 'minute', 'major', 'problems']

In [92]:
def tokenize(sentence):
  doc = nlp(sentence)
  token = [token.lemma_.lower() for token in doc if token.lemma_.lower() not in stopwords and not(token.is_punct)]
  return " ".join(token)

In [93]:
data["review_cleaned"] = data["review"].apply(tokenize)

In [94]:
data.head()

Unnamed: 0,review,label,review_cleaned
0,So there is no way for me to plug it in here i...,0,no way plug converter
1,"Good case, Excellent value.",1,good case excellent value
2,Great for the jawbone.,1,great jawbone
3,Tied to charger for conversations lasting more...,0,tie charger conversation 45 minute major problems
4,The mic is great.,1,mic great


In [108]:
import string

In [110]:
punct = string.punctuation

In [111]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [112]:
def text_data_cleaning(sentence):
  doc = nlp(sentence)
  tokens = []
  for token in doc:
    if token.lemma_!="-PRON-":
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
      if token not in stopwords and token not in punct:
        cleaned_tokend.append(token)
    return cleaned_tokens

In [113]:
tfidf = TfidfVectorizer(tokenizer =text_data_cleaning)

#3) Model Building

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
X_train, X_test, y_train, y_test = train_test_split(data["review_cleaned"],data["label"], test_size=0.3, random_state=0)

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [98]:
from sklearn.pipeline import Pipeline

In [99]:
from sklearn.metrics import f1_score,confusion_matrix,classification_report

In [104]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import FunctionTransformer

In [101]:
classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    LinearSVC(),
    SVC(kernel="rbf"),
    MLPClassifier()
]

In [102]:
def evaluate_pipeline(pipeline, X_test, y_test):
    predictions = pipeline.predict(X_test)
    f1 = f1_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    return  f1, report

In [107]:
for clf in classifiers:
  print(clf.__class__.__name__)
  tfidf_pipeline = Pipeline([("tfidf",TfidfVectorizer()),("clf",clf)])
  tfidf_pipeline.fit(X_train,y_train)
  score, report = evaluate_pipeline(tfidf_pipeline, X_test, y_test)
  print("tfidf")
  print(f"Model: {clf.__class__.__name__}")
  print(f"F1: {score}")
  print(f"Classification Report:\n{report}\n")
  bow_pipeline = Pipeline([("bow",CountVectorizer()),("to_dense", FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),("clf",clf)])
  bow_pipeline.fit(X_train,y_train)
  score, report = evaluate_pipeline(bow_pipeline, X_test, y_test)
  print("bag of words")
  print(f"Model: {clf.__class__.__name__}")
  print(f"F1: {score}")
  print(f"Classification Report:\n{report}\n")


LogisticRegression
tfidf
Model: LogisticRegression
F1: 0.843230403800475
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       407
           1       0.84      0.85      0.84       418

    accuracy                           0.84       825
   macro avg       0.84      0.84      0.84       825
weighted avg       0.84      0.84      0.84       825


bag of words
Model: LogisticRegression
F1: 0.8279181708784596
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       407
           1       0.83      0.82      0.83       418

    accuracy                           0.83       825
   macro avg       0.83      0.83      0.83       825
weighted avg       0.83      0.83      0.83       825


MultinomialNB
tfidf
Model: MultinomialNB
F1: 0.8128724672228844
Classification Report:
              precision    recall  f1-score   support

           0       0.