# Text Classification using nltk

### In this notebook, we will use Twitter dataset to do binary classification on texts. Specifically, we train the text classification model by constructing TF-IDF matrix and training on simple models.

In [3]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from helpers_simple_ml import load_data_and_labels, create_submission_file, normalization

sns.set()
%matplotlib inline

In [4]:
# load training and testing set
positive_txt_path = 'data/train_pos_full.txt'
negative_txt_path = 'data/train_neg_full.txt'
test_txt_path = 'data/test_data.txt'
train, label, test = load_data_and_labels(positive_txt_path, negative_txt_path, test_txt_path)

In [5]:
# transform training data into a dataframe
tweet_df = pd.DataFrame.from_dict({'tweet': train, 'label': label})
tweet_df.head()

Unnamed: 0,tweet,label
0,dunno justin read mention justin god knows hop...,1
1,logic dumb wo even crop name photo tsk,1
2,put casper box looved battle crakkbitch,1
3,thanks sir trip lil mama keep doin ya thang,1
4,visiting brother tmr bestest birthday gift eve...,1


In [6]:
# check the number of each label in training data
tweet_df['label'].value_counts()

-1    1250000
 1    1250000
Name: label, dtype: int64

In [7]:
# apply normalization to each row
tweet_df['tweet'] = (tweet_df['tweet'].str.split()).apply(normalization)

tweet_df.head()

Unnamed: 0,tweet,label
0,dunno justin read mention justin god know hope...,1
1,logic dumb wo even crop name photo tsk,1
2,put casper box looved battle crakkbitch,1
3,thank sir trip lil mama keep doin ya thang,1
4,visit brother tmr bestest birthday gift eveerrr,1


In [13]:
# convert string tweets to weighted TF-IDF matrix and apply logistic regression as the classifier
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer='word', ngram_range=(1, 3), max_df=1.0)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(solver='saga', C=10))
])

In [14]:
# split the original training dataset into training set and validation set
tweet_train, tweet_val, label_train, label_val = train_test_split(tweet_df['tweet'], tweet_df['label'], test_size = 0.2)

# train and predict
pipeline.fit(tweet_train, label_train)
predictions = pipeline.predict(tweet_val)

# result
print(classification_report(predictions, label_val))
print(confusion_matrix(predictions,label_val))
print(accuracy_score(predictions,label_val))

              precision    recall  f1-score   support

          -1       0.83      0.83      0.83    249396
           1       0.83      0.83      0.83    250604

    accuracy                           0.83    500000
   macro avg       0.83      0.83      0.83    500000
weighted avg       0.83      0.83      0.83    500000

[[207154  42242]
 [ 42959 207645]]
0.829598


In [10]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer='word', ngram_range=(1, 3), max_df=1.0)),
    ('tfidf', TfidfTransformer()),
    ('classifier', GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.87, max_depth = 5))
])

In [11]:
# train and predict
pipeline.fit(tweet_train, label_train)
predictions = pipeline.predict(tweet_val)

# result
print(classification_report(predictions, label_val))
print(confusion_matrix(predictions,label_val))
print(accuracy_score(predictions,label_val))

              precision    recall  f1-score   support

          -1       0.69      0.82      0.75    207610
           1       0.86      0.73      0.79    292390

    accuracy                           0.77    500000
   macro avg       0.77      0.78      0.77    500000
weighted avg       0.79      0.77      0.77    500000

[[171256  36354]
 [ 77774 214616]]
0.771744


In [11]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer='word', ngram_range=(1, 3), max_df=1.0)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB(alpha = 0.1))
])

In [12]:
# train and predict
pipeline.fit(tweet_train, label_train)
predictions = pipeline.predict(tweet_val)

# result
print(classification_report(predictions, label_val))
print(confusion_matrix(predictions,label_val))
print(accuracy_score(predictions,label_val))

              precision    recall  f1-score   support

          -1       0.75      0.84      0.79    221968
           1       0.86      0.77      0.82    278032

    accuracy                           0.81    500000
   macro avg       0.81      0.81      0.80    500000
weighted avg       0.81      0.81      0.81    500000

[[187320  34648]
 [ 62810 215222]]
0.805084


In [12]:
# generate prediction of testing data and save the file
predictions = pipeline.predict(pd.Series(test))
create_submission_file(predictions, "TWN1_submission.csv")