In [None]:
#Yash Shah
#BOW - TF IDF, LDA (Topic Modelling), Classification Algorithm (Random Forest)

# Importing Libraries

In [63]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import unidecode
import textstat
import string  

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF
from sklearn.neural_network import MLPClassifier

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [64]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.1.


# Reading Data

In [65]:
#Reading the csv file and looking at the schema of data
df = pd.read_csv("Tweets.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [66]:
#Checking the label count (In this case it is almost balanced dataset)
df['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

# Cleaning Data

In [67]:
#Dropping columns that add zero value
cols= ['tweet_id','airline_sentiment_gold', 'name', 'negativereason_gold', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']
df = df.drop(cols, axis=1)

#Filling missing values
df['negativereason_confidence'] = df['negativereason_confidence'].fillna(0)
df['negativereason'] = df['negativereason'].fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   airline_sentiment             14640 non-null  object 
 1   airline_sentiment_confidence  14640 non-null  float64
 2   negativereason                14640 non-null  object 
 3   negativereason_confidence     14640 non-null  float64
 4   airline                       14640 non-null  object 
 5   retweet_count                 14640 non-null  int64  
 6   text                          14640 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 800.8+ KB


# Encoding

In [68]:
cols = ['airline', 'negativereason']

df = pd.concat([df, pd.get_dummies(df[cols])],axis=1)
df = df.drop(cols, axis=1)
df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason_confidence,retweet_count,text,airline_American,airline_Delta,airline_Southwest,airline_US Airways,airline_United,...,negativereason_Bad Flight,negativereason_Can't Tell,negativereason_Cancelled Flight,negativereason_Customer Service Issue,negativereason_Damaged Luggage,negativereason_Flight Attendant Complaints,negativereason_Flight Booking Problems,negativereason_Late Flight,negativereason_Lost Luggage,negativereason_longlines
0,neutral,1.0,0.0,0,@VirginAmerica What @dhepburn said.,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,positive,0.3486,0.0,0,@VirginAmerica plus you've added commercials t...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,neutral,0.6837,0.0,0,@VirginAmerica I didn't today... Must mean I n...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,negative,1.0,0.7033,0,@VirginAmerica it's really aggressive to blast...,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,negative,1.0,1.0,0,@VirginAmerica and it's a really big bad thing...,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Text Analytics - Cleaning the text column

In [69]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode

stop_words = set(stopwords.words('english'))

lemmer = WordNetLemmatizer()

def preprocess(x):
    # Lowercase function - Make all text lower case
    x = x.lower()
    
    #Replace words with @ with blank
    x = re.sub('\S*@\S*\s?', '', x)
    
    #Replace special characters with blank
    x = re.sub('[^\w\s]','',x)
    
    # lemmatize the text data
    x = [lemmer.lemmatize(w) for w in x.split() if w not in stop_words]

    return ' '.join(x)

df['text_clean'] = df['text'].apply(preprocess)
df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason_confidence,retweet_count,text,airline_American,airline_Delta,airline_Southwest,airline_US Airways,airline_United,...,negativereason_Can't Tell,negativereason_Cancelled Flight,negativereason_Customer Service Issue,negativereason_Damaged Luggage,negativereason_Flight Attendant Complaints,negativereason_Flight Booking Problems,negativereason_Late Flight,negativereason_Lost Luggage,negativereason_longlines,text_clean
0,neutral,1.0,0.0,0,@VirginAmerica What @dhepburn said.,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,said
1,positive,0.3486,0.0,0,@VirginAmerica plus you've added commercials t...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,plus youve added commercial experience tacky
2,neutral,0.6837,0.0,0,@VirginAmerica I didn't today... Must mean I n...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,didnt today must mean need take another trip
3,negative,1.0,0.7033,0,@VirginAmerica it's really aggressive to blast...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,really aggressive blast obnoxious entertainmen...
4,negative,1.0,1.0,0,@VirginAmerica and it's a really big bad thing...,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,really big bad thing


# Feature Engineering

In [70]:
df['len'] = df['text_clean'].apply(lambda x: len(x))
df['syllable_count'] = df['text_clean'].apply(lambda x: textstat.syllable_count(x))
df['lexicon_count'] = df['text_clean'].apply(lambda x: textstat.lexicon_count(x))

df.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason_confidence,retweet_count,text,airline_American,airline_Delta,airline_Southwest,airline_US Airways,airline_United,...,negativereason_Damaged Luggage,negativereason_Flight Attendant Complaints,negativereason_Flight Booking Problems,negativereason_Late Flight,negativereason_Lost Luggage,negativereason_longlines,text_clean,len,syllable_count,lexicon_count
0,neutral,1.0,0.0,0,@VirginAmerica What @dhepburn said.,0,0,0,0,0,...,0,0,0,0,0,0,said,4,1,1
1,positive,0.3486,0.0,0,@VirginAmerica plus you've added commercials t...,0,0,0,0,0,...,0,0,0,0,0,0,plus youve added commercial experience tacky,44,12,6
2,neutral,0.6837,0.0,0,@VirginAmerica I didn't today... Must mean I n...,0,0,0,0,0,...,0,0,0,0,0,0,didnt today must mean need take another trip,44,12,8
3,negative,1.0,0.7033,0,@VirginAmerica it's really aggressive to blast...,0,0,0,0,0,...,0,0,0,0,0,0,really aggressive blast obnoxious entertainmen...,78,21,10
4,negative,1.0,1.0,0,@VirginAmerica and it's a really big bad thing...,0,0,0,0,0,...,0,0,0,0,0,0,really big bad thing,20,6,4


# Bag Of Words - TF IDF

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

#TF calculation
tfidf_vectorizer = TfidfVectorizer(min_df=.02, max_df=.5, ngram_range=[1,3], max_features=1500, stop_words='english')
dtm_tfidf = tfidf_vectorizer.fit_transform(df['text_clean'])

#IDF Calculation
bow_df_tfidf = pd.DataFrame(dtm_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names(), index=df.index)
bow_df_tfidf.shape

#TF - IDF Matrix
df_bow_tfidf = pd.concat([df, bow_df_tfidf], axis=1)

#Dropping columns text and text_clean
df_bow_tfidf.drop(columns=['text_clean', 'text'], inplace=True)

#Viewing Dataset
df_bow_tfidf.shape
df_bow_tfidf.head()

(14640, 52)

(14640, 76)

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason_confidence,retweet_count,airline_American,airline_Delta,airline_Southwest,airline_US Airways,airline_United,airline_Virgin America,...,thanks,ticket,time,today,tomorrow,trying,wait,waiting,way,weather
0,neutral,1.0,0.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,positive,0.3486,0.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,neutral,0.6837,0.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.73813,0.0,0.0,0.0,0.0,0.0,0.0
3,negative,1.0,0.7033,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,negative,1.0,1.0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Final Dataset - Model 

In [72]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

y = df_bow_tfidf['airline_sentiment']
X = df_bow_tfidf.loc[:, df_bow_tfidf.columns != 'airline_sentiment']

feature_names = X.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape
X_test.shape
y_train.shape
y_test.shape

(11712, 75)

(2928, 75)

(11712,)

(2928,)

# Model Development - SVM

In [73]:
from sklearn import svm
#IMPORTING LIBRARIES REQUIRED FOR MODELLING
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import RepeatedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, make_scorer, f1_score,accuracy_score, cohen_kappa_score, log_loss

#Parameters for SVM
param_grid_svm = {'kernel': ['rbf'], 'gamma': [0.001, 0.01,0.1, 0.0001],
                     'C': [1, 10, 100, 1000, 10000]}
svm_model = svm.SVC(probability=True,random_state=77300)

#pipeline for SVM
pipe = Pipeline([('svm', svm_model)])
cv = RepeatedKFold(n_splits=5, n_repeats=2)

new_params = {'SVM__' + key: param_grid_svm[key] for key in param_grid_svm}

#Model to grid search parameters for svm
grid_search_rf = GridSearchCV(estimator = pipe, param_grid = new_params, 
                          cv = cv ,  scoring = 'f1_micro',return_train_score = True)

In [74]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('svm', SVC(probability=True, random_state=77300))])

# Accuracy

In [75]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

pred_val = pipe.predict(X_test)

print("Confusion matrix:")
print(confusion_matrix(y_test, pred_val))

print("\nF1 Score = {:.5f}".format(f1_score(y_test, pred_val, average="micro")))

print("\nClassification Report:")
print(classification_report(y_test, pred_val))

Confusion matrix:
[[1889    0    0]
 [   0  512   68]
 [   0  184  275]]

F1 Score = 0.91393

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      1889
     neutral       0.74      0.88      0.80       580
    positive       0.80      0.60      0.69       459

    accuracy                           0.91      2928
   macro avg       0.85      0.83      0.83      2928
weighted avg       0.92      0.91      0.91      2928



# TOPIC MODELLING - LDA Model

In [76]:

from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=20,
                                      doc_topic_prior=None,
                                      topic_word_prior=None,
                                      max_iter=200, 
                                      learning_method='batch', 
                                      random_state=123,
                                      n_jobs=2,
                                      verbose=0)
lda_output = lda_model.fit(dtm_tfidf)

# Log Likelyhood: Higher the better
ll = lda_model.score(dtm_tfidf)

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
perp = lda_model.perplexity(dtm_tfidf)

In [77]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, dtm_tfidf, tfidf_vectorizer, mds="tsne")