In [1]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Dropout

Using TensorFlow backend.


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utkarsh.goyal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv('Tweets.csv')
df = df.reindex(np.random.permutation(df.index))  
df = df[['text', 'airline_sentiment']]
df.head()

Unnamed: 0,text,airline_sentiment
11693,@USAirways @AmericanAir Your staff members are...,negative
3478,@united Trying to locate passenger that landed...,neutral
7862,@JetBlue if you want to fly in a storm that's ...,negative
7745,"@JetBlue If this is customer service, then ple...",negative
2685,@united I'm Cancelled Flighting my #mileageplu...,negative


In [4]:
def remove_stopwords(input_text):
        stopwords_list = stopwords.words('english')
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 

In [5]:
df.text = df.text.apply(remove_stopwords)
df.head()

Unnamed: 0,text,airline_sentiment
11693,@USAirways @AmericanAir Your staff members loo...,negative
3478,@united Trying locate passenger landed hrs ago...,neutral
7862,@JetBlue want fly storm that's right. But give...,negative
7745,"@JetBlue If customer service, please call me.",negative
2685,@united I'm Cancelled Flighting #mileageplus c...,negative


In [6]:
def remove_mentions(input_text):
        return re.sub(r'@\w+', '', input_text)
    
df.text = df.text.apply(remove_mentions)
df.head()

Unnamed: 0,text,airline_sentiment
11693,Your staff members looking like two heads. C...,negative
3478,Trying locate passenger landed hrs ago UA938...,neutral
7862,want fly storm that's right. But give us choi...,negative
7745,"If customer service, please call me.",negative
2685,I'm Cancelled Flighting #mileageplus card NEV...,negative


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.airline_sentiment, test_size=0.25, random_state=7)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])

# Train data samples: 10980
# Test data samples: 3660


In [8]:
tk = Tokenizer(num_words=25000,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")

tk.fit_on_texts(X_train)

print('Fitted tokenizer on {} documents'.format(tk.document_count))
print('{} words in dictionary'.format(tk.num_words))
print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5))

Fitted tokenizer on 10980 documents
25000 words in dictionary
Top 5 most common words are: [('flight', 2998), ('not', 1202), ('no', 1128), ('get', 1013), ('t', 887)]


In [9]:
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

print('"{}" is converted into {}'.format(X_train[0], X_train_seq[0]))

" What  said." is converted into [30, 322, 5453, 5454, 302, 323, 423, 669, 480, 164, 53, 1474, 1376]


In [10]:
def one_hot_seq(seqs, nb_features = 25000):
    ohs = np.zeros((len(seqs), nb_features))
    for i, s in enumerate(seqs):
        ohs[i, s] = 1.
    return ohs

X_train_oh = one_hot_seq(X_train_seq)
X_test_oh = one_hot_seq(X_test_seq)

print('"{}" is converted into {}'.format(X_train_seq[0], X_train_oh[0]))
print('For this example we have {} features with a value of 1.'.format(X_train_oh[0].sum()))

"[30, 322, 5453, 5454, 302, 323, 423, 669, 480, 164, 53, 1474, 1376]" is converted into [0. 0. 0. ... 0. 0. 0.]
For this example we have 13.0 features with a value of 1.


In [11]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

print('"{}" is converted into {}'.format(y_train[0], y_train_le[0]))
print('"{}" is converted into {}'.format(y_train_le[0], y_train_oh[0]))

"neutral" is converted into 2
"2" is converted into [0. 0. 1.]


# Modeling

In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [13]:
model_params = {
#     'svm': {
#         'model': SVC(gamma='auto',max_iter=-1),
#         'params' : {
#             'C': [1,10,20],
#             # 'kernel': ['linear', 'rbf',],
#     #         #'shrinking':['True', 'False']
#         }  
#     },

#     'kneighbours': {
#         'model': KNeighborsClassifier(n_jobs=-1),
#         "params": {
            
#             'n_neighbors' : [5, 10, 20, 30],
#         }
#     },
    'random_forest': {
        'model': RandomForestClassifier(n_jobs=-1),
        'params' : {
            'n_estimators': [1,5,10,100],
            'max_depth': [1,2,3,4,5,6],
            'min_samples_leaf': [100,200,300,500,1000],
            'criterion' : ['gini','entropy'], 
        }
    },
#     'logistic_regression' : {
#         'model': LogisticRegression(multi_class='auto'),
#         'params': {
#             'C': [1,5,10],
# #             'solver':['newton-cg', 'liblinear']
#         }
#     },
#     'naive_bayes_gaussian': {
#         'model': GaussianNB(),
#         'params': {}
#     },
#     'naive_bayes_multinomial': {
#         'model': MultinomialNB(),
#         'params': {}
#     },
#     'decision_tree': {
#         'model': DecisionTreeClassifier(),
#         'params': {
#             'criterion': ['gini','entropy'],
#             'max_depth': [1,2,3,4,5,6],
#             'min_samples_leaf': [100,200,300,500,1000]
#         }
#     }     
}

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    print(model_name)
    clf =  GridSearchCV(mp['model'], mp['params'], cv=2, 
                        return_train_score=False)
    clf.fit(X_train_oh, y_train_oh)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
score_df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
score_df

random_forest


# Modeling-Keras

In [None]:
base_model = Sequential()
base_model.add(Dense(64, activation='relu', input_shape=(25000,)))
base_model.add(Dropout(0.5))
base_model.add(Dense(32, activation='relu'))
base_model.add(Dropout(0.5))
base_model.add(Dense(3, activation='softmax'))
base_model.summary()

In [None]:
base_model.compile(optimizer='rmsprop'
                  , loss='categorical_crossentropy'
                  , metrics=['accuracy'])
    
base_model.fit(X_train_oh, y_train_oh, epochs=50, batch_size=1024, verbose=1)

In [None]:
base_model.evaluate(X_test_oh, y_test_oh)