# Text Classification

### In this notebook, we will use Twitter dataset to do binary classification on texts. Specifically, we train the text classification model using **word2vec** and training on simple models.

In [1]:
import pandas as pd
import numpy as np
import spacy
import os
import gensim.models.keyedvectors as word2vec
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from helpers_simple_ml import load_data_and_labels, create_submission_file

In [2]:
# load training and testing set
positive_txt_path = 'data/train_pos_full.txt'
negative_txt_path = 'data/train_neg_full.txt'
test_txt_path = 'data/test_data.txt'
train, label, test = load_data_and_labels(positive_txt_path, negative_txt_path, test_txt_path)

In [3]:
# transform training data into a dataframe
tweet_df = pd.DataFrame.from_dict({'tweet': train, 'label': label})
tweet_df.head()

Unnamed: 0,tweet,label
0,dunno justin read mention justin god knows hop...,1
1,logic dumb wo even crop name photo tsk,1
2,put casper box looved battle crakkbitch,1
3,thanks sir trip lil mama keep doin ya thang,1
4,visiting brother tmr bestest birthday gift eve...,1


In [4]:
# check the number of each label in training data
tweet_df['label'].value_counts()

-1    1250000
 1    1250000
Name: label, dtype: int64

In [5]:
# lexicon normalization
def normalization(tweet_list):
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    return normalized_tweet
    
# apply normalization to each row
tweet_df['tweet'] = (tweet_df['tweet'].str.split()).apply(normalization)

tweet_df.head()

Unnamed: 0,tweet,label
0,"[dunno, justin, read, mention, justin, god, kn...",1
1,"[logic, dumb, wo, even, crop, name, photo, tsk]",1
2,"[put, casper, box, looved, battle, crakkbitch]",1
3,"[thank, sir, trip, lil, mama, keep, doin, ya, ...",1
4,"[visit, brother, tmr, bestest, birthday, gift,...",1


In [6]:
%%time
if os.path.exists('model_word2vec.bin'):
    # load model
    model = word2vec.KeyedVectors.load_word2vec_format("model_word2vec.bin", binary = True)
else:
    # beware that this could take a day for training
    # train embedding model by word2vec
    print("\n Training the word2vec model...\n")
    model = Word2Vec(tweet_df.tweet.to_list(), size = 100, min_count = 1, sg = 1, workers = 5, iter = 100)
    # save model
    model.wv.save_word2vec_format('model_word2vec.bin', binary=True)

CPU times: user 6.09 s, sys: 224 ms, total: 6.31 s
Wall time: 6.38 s


In [7]:
# check whether there is a tweet with length equal to 0
display(tweet_df[(tweet_df.tweet.apply(len) == 0)])

# remove them
tweet_df_cleaned = tweet_df[~(tweet_df.tweet.apply(len) == 0)]

# check again
display(tweet_df_cleaned[(tweet_df_cleaned.tweet.apply(len) == 0)])

Unnamed: 0,tweet,label
149,[],1
365,[],1
1609,[],1
1981,[],1
1982,[],1
...,...,...
2497388,[],-1
2497548,[],-1
2497798,[],-1
2498992,[],-1


Unnamed: 0,tweet,label


In [8]:
def tweet_vector(tweet):
    """Create document vectors by averaging word vectors."""
    return np.mean(model.wv.__getitem__(tweet), axis=0)

tweet_df_cleaned['vector'] = tweet_df_cleaned.tweet.map(tweet_vector)
tweet_df_cleaned.head()

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,tweet,label,vector
0,"[dunno, justin, read, mention, justin, god, kn...",1,"[-0.106876895, 0.12449849, -0.069821864, 0.114..."
1,"[logic, dumb, wo, even, crop, name, photo, tsk]",1,"[0.05467985, -0.2727772, -0.012513335, 0.26039..."
2,"[put, casper, box, looved, battle, crakkbitch]",1,"[-0.28428942, -0.07732746, -0.41212776, 0.0288..."
3,"[thank, sir, trip, lil, mama, keep, doin, ya, ...",1,"[-0.3360731, -0.08575468, -0.3192463, 0.205857..."
4,"[visit, brother, tmr, bestest, birthday, gift,...",1,"[-0.38955864, 0.08355774, -0.29862565, 0.42037..."


In [9]:
# split the original training dataset into training set and validation set
tweet_train, tweet_val, label_train, label_val = train_test_split(tweet_df_cleaned['vector'], \
                                                                  tweet_df_cleaned['label'], \
                                                                  test_size = 0.2)

In [14]:
# logistic regression model
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', LogisticRegression(solver = 'saga', C = 10))
])

pipeline.fit(tweet_train.tolist(), label_train.values)
predictions = pipeline.predict(tweet_val.tolist())

# result
print(classification_report(predictions, label_val))
print(confusion_matrix(predictions,label_val))
print(accuracy_score(predictions,label_val))

              precision    recall  f1-score   support

          -1       0.73      0.76      0.74    238212
           1       0.77      0.74      0.75    260239

    accuracy                           0.75    498451
   macro avg       0.75      0.75      0.75    498451
weighted avg       0.75      0.75      0.75    498451

[[181400  56812]
 [ 68074 192165]]
0.7494518016816096


In [12]:
# gradient boosting classifier
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.7, max_depth = 5))
])

pipeline.fit(tweet_train.tolist(), label_train.values)
predictions = pipeline.predict(tweet_val.tolist())

# result
print(classification_report(predictions, label_val))
print(confusion_matrix(predictions,label_val))
print(accuracy_score(predictions,label_val))

              precision    recall  f1-score   support

          -1       0.77      0.80      0.79    238003
           1       0.81      0.78      0.79    260448

    accuracy                           0.79    498451
   macro avg       0.79      0.79      0.79    498451
weighted avg       0.79      0.79      0.79    498451

[[191290  46713]
 [ 57958 202490]]
0.7900074430585955


In [11]:
# multinomial naive bayes
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', MultinomialNB(alpha = 0.1))
])

pipeline.fit(tweet_train.tolist(), label_train.values)
predictions = pipeline.predict(tweet_val.tolist())

# result
print(classification_report(predictions, label_val))
print(confusion_matrix(predictions, label_val))
print(accuracy_score(predictions, label_val))

              precision    recall  f1-score   support

          -1       0.54      0.68      0.60    197419
           1       0.75      0.62      0.68    301032

    accuracy                           0.65    498451
   macro avg       0.65      0.65      0.64    498451
weighted avg       0.67      0.65      0.65    498451

[[134872  62547]
 [114376 186656]]
0.645054378464483


In [None]:
# transform testing data list to series
test_series = pd.Series(test)

# vectorize
test_series = (test_series.str.split()).apply(normalization).apply(tweet_vector)

# generate prediction of testing data and save the file
predictions = gbc.predict(test_series.tolist())
create_submission_file(predictions, "TWN1_submission.csv")