# Project Notebook

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import nltk
import tensorflow as tf
import gzip

# Fetch the NLTK resources the cleaning cell relies on: tokenizer models,
# WordNet, the stop-word lists and the POS tagger.
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

## Data exploration

### Cleaning data algorithm

In [None]:
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

def lemmatize_sentence(tokens):
    """Return the WordNet lemma of every token, in order.

    Each token is POS-tagged first; tags starting with 'NN' are lemmatized as
    nouns, tags starting with 'VB' as verbs and everything else as adjectives.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Collapse the tagger's tag set onto the three WordNet classes used here.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]

def clean_single(tweet, stop_words = stopwords.words('english'), numbers=True):
    """Normalise one raw text into a space-joined string of cleaned tokens.

    Processing steps, in order:
      1. strip URLs, @mentions and $-prefixed tokens;
      2. tokenize and POS-tag;
      3. lemmatize each token (noun / verb / adjective, depending on its tag);
      4. drop punctuation and stop words, lowercase what is kept;
      5. optionally drop every token that contains a digit.

    Args:
        tweet: the raw text to clean.
        stop_words: collection of lowercase words to discard.
        numbers: when true, tokens containing any of the digits 0-9 are removed.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    # Removing links (raw strings: the pattern contains regex escapes)
    tweet = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    # Removing @
    tweet = re.sub(r"(@[A-Za-z0-9_]+)", "", tweet)
    # Removing currencies
    tweet = re.sub(r'\$\w*', '', tweet)

    # Built once per call instead of once per token.
    stop_set = frozenset(stop_words)
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token, tag in pos_tag(word_tokenize(tweet)):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_set:
            cleaned_tokens.append(token.lower())

    # remove tokens containing numbers
    if numbers:
        digits = frozenset('0123456789')
        cleaned_tokens = [w for w in cleaned_tokens if digits.isdisjoint(w)]

    # Merge tokens. This used to live inside the `if numbers:` branch, so
    # numbers=False failed with an unbound `merged`.
    return ' '.join(cleaned_tokens)

def clean(array):
    """Clean every entry of `array` in place with clean_single and return `array`."""
    for position, raw_text in enumerate(array):
        cleaned_text = clean_single(raw_text)
        array[position] = cleaned_text
    return array

### Opening dataset files, cleaning and saving

In [None]:
# df_raw_sentiment = pd.read_csv('data/training.1600000.processed.noemoticon.csv', 
#                           engine='python', 
#                           header=None, 
#                           names=['score', 'tweet_id', 'date', '?', 'account', 'text'])
# df_sentiment = df_raw_sentiment[['score', 'text']]


# classification = np.ones(len(df_sentiment), dtype=np.int)
# classification[df_sentiment['score']==0] = -1

# df_sentiment.insert(loc=2, column='classification', value=classification)
# df_sentiment = df_sentiment[['classification', 'text']]

# sentiment_classification = df_sentiment.classification.values
# sentiment_text = clean(df_sentiment.text.values)

# print(sentiment_classification.shape, sentiment_text.shape)

# temp = pd.DataFrame({'text':sentiment_text,'classification':sentiment_classification})
# temp.to_csv('data/sentiment_cleaned.csv', index=False)

# df_raw_airlines = pd.read_csv('data/Tweets.csv', engine='python')
# df_airlines = df_raw_airlines[['airline_sentiment', 'text']]

# classification = np.zeros(len(df_airlines), dtype=np.int)
# classification[df_airlines['airline_sentiment']=='negative'] = -1
# classification[df_airlines['airline_sentiment']=='neutral'] = 0
# classification[df_airlines['airline_sentiment']=='positive'] = 1

# df_airlines.insert(loc=2, column='classification', value=classification)
# df_airlines = df_airlines[['classification', 'text']]

# airlines_classification = df_airlines.classification.values
# airlines_text = clean(df_airlines.text.values)

# print(airlines_classification.shape, airlines_text.shape)

# temp = pd.DataFrame({'text':airlines_text,'classification':airlines_classification})
# temp.to_csv('data/airlines_cleaned.csv', index=False)

In [None]:
# def parse(path):
#     g = gzip.open(path, 'r')
#     for l in g:
#         yield eval(l)

# reviews = []
# scores = []
# for review in parse('data/reviews_Movies_and_TV_5.json.gz'):
#     reviews.append(clean_single(review['reviewText']))
#     scores.append(review['overall'])
#     if len(scores) in [k*100000 for k in range(1,17)]:
#         print(len(scores))
# scores=np.asarray(scores).astype(dtype=np.int)
# classification = np.zeros(scores.shape, dtype=np.int)
# classification[scores<=2] = -1
# classification[scores>=4] = 1
# temp = pd.DataFrame({'text':reviews, 'classification':classification})
# temp.to_csv('data/amazon_movies.csv')

#### Opening cleaned data

In [None]:
# Load the cleaned airlines tweets and drop rows with missing values
# (presumably texts that became empty after cleaning -- confirm).
df_airlines = pd.read_csv('data/airlines_cleaned.csv').dropna()
df_airlines.head()

In [None]:
# Load the cleaned sentiment tweets and drop rows with missing values
# (presumably texts that became empty after cleaning -- confirm).
df_sentiment = pd.read_csv('data/sentiment_cleaned.csv').dropna()
df_sentiment.head()

In [None]:
# Load the cleaned Amazon movie reviews and drop rows with missing values
# (presumably texts that became empty after cleaning -- confirm).
df_amazon = pd.read_csv('data/amazon_movies.csv').dropna()
df_amazon.head()

### Visualisation

#### Sentiment distribution across the datasets

In [None]:
# One pie chart per dataset showing how many samples fall in each class.
# Each spec: (frame, legend labels, class values, colors, start angle, title).
pie_specs = [
    (df_airlines, ['positive', 'neutral', 'negative'], [1, 0, -1],
     ['green', 'yellow', 'red'], 140,
     'Sentiment distribution on airlines dataset'),
    (df_sentiment, ['positive', 'negative'], [1, -1],
     ['green', 'red'], 90,
     'Sentiment distribution on sentiment dataset'),
    (df_amazon, ['positive', 'neutral', 'negative'], [1, 0, -1],
     ['green', 'yellow', 'red'], 140,
     'Sentiment distribution on the dataset of amazon reviews'),
]

plt.figure(figsize=(14,5))
for position, (frame, labels, class_values, colors, angle, title) in enumerate(pie_specs, start=1):
    plt.subplot(1, 3, position)
    sizes = [np.sum(frame.classification == value) for value in class_values]
    patches, texts = plt.pie(sizes, colors=colors, shadow=True, startangle=angle)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.title(title)

plt.savefig('figures/pie_charts_sentiment_distribution.png')
plt.show()

#### Wordclouds for the airlines dataset

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Concatenate all cleaned tweets of each sentiment class into one document.
pos_text = " ".join(df_airlines.loc[df_airlines.classification == 1, 'text'].values)
neg_text = " ".join(df_airlines.loc[df_airlines.classification == -1, 'text'].values)
neu_text = " ".join(df_airlines.loc[df_airlines.classification == 0, 'text'].values)

# The same rendering options are used for the three clouds.
cloud_options = dict(max_font_size=50, max_words=100, background_color="white")
wordcloud_pos = WordCloud(**cloud_options).generate(pos_text)
wordcloud_neg = WordCloud(**cloud_options).generate(neg_text)
wordcloud_neu = WordCloud(**cloud_options).generate(neu_text)

# Panels from left to right: positive, neutral, negative.
panels = [(wordcloud_pos, 'Positive reviews'),
          (wordcloud_neu, 'Neutral reviews'),
          (wordcloud_neg, 'Negative reviews')]

plt.figure(figsize=(20, 20))
for position, (cloud, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
plt.savefig('figures/airline_wordclouds.png')
plt.show()

#### Wordclouds for the sentiment dataset

In [None]:
# The sentiment dataset only has positive (1) and negative (-1) samples, so
# two clouds are drawn. Concatenate the cleaned tweets of each class first.
pos_text = " ".join(df_sentiment[df_sentiment.classification==1].text.values)
neg_text = " ".join(df_sentiment[df_sentiment.classification==-1].text.values)

wordcloud_pos = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(pos_text)
wordcloud_neg = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(neg_text)

plt.figure(figsize=(20, 20))
plt.subplot(1,2,1)
plt.imshow(wordcloud_pos, interpolation="bilinear")
plt.axis("off")
plt.title('Positive reviews')
plt.subplot(1,2,2)
plt.imshow(wordcloud_neg, interpolation="bilinear")
plt.axis("off")
# This panel shows the negative cloud; it was previously titled 'Neutral reviews'.
plt.title('Negative reviews')
plt.savefig('figures/sentiment_wordclouds.png')
plt.show()

#### Wordclouds for the amazon reviews dataset

In [None]:
# Concatenate all cleaned reviews of each sentiment class into one document.
pos_text = " ".join(df_amazon.loc[df_amazon.classification == 1, 'text'].values)
neg_text = " ".join(df_amazon.loc[df_amazon.classification == -1, 'text'].values)
neu_text = " ".join(df_amazon.loc[df_amazon.classification == 0, 'text'].values)

# The same rendering options are used for the three clouds.
cloud_options = dict(max_font_size=50, max_words=100, background_color="white")
wordcloud_pos = WordCloud(**cloud_options).generate(pos_text)
wordcloud_neg = WordCloud(**cloud_options).generate(neg_text)
wordcloud_neu = WordCloud(**cloud_options).generate(neu_text)

# Panels from left to right: positive, neutral, negative.
panels = [(wordcloud_pos, 'Positive reviews'),
          (wordcloud_neu, 'Neutral reviews'),
          (wordcloud_neg, 'Negative reviews')]

plt.figure(figsize=(20, 20))
for position, (cloud, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
plt.savefig('figures/amazon_wordclouds.png')
plt.show()

## Feature extraction

### TF-IDF Vectorizer without bag of words (unigrams only)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF features of the airlines tweets, L2-normalised per document;
# min_df=10 sets the minimum document frequency for a term to be kept.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), norm='l2', min_df=10)

y_train = df_airlines.classification.values
# Dense array: the classifiers below index it directly with fold indices.
x_train = vectorizer.fit_transform(df_airlines.text.values).toarray()
x_train.shape

### TF-IDF Vectorizer with bag of words (1- to 3-grams)

In [None]:
# Same TF-IDF settings as the unigram vectorizer, but over 1- to 3-grams.
vectorizer_bow = TfidfVectorizer(ngram_range=(1, 3), norm='l2', min_df=10)

airlines_ngram_tfidf = vectorizer_bow.fit_transform(df_airlines.text.values)
x_train_bow = airlines_ngram_tfidf.toarray()
x_train_bow.shape

## Classification

### Naive Bayes

#### Without bag of words

In [None]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB

# 4-fold shuffled cross-validation of multinomial Naive Bayes on x_train.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = MultinomialNB().fit(x_train[train_index], y_train[train_index])
    scores += [classifier.score(x_train[test_index], y_train[test_index])]

# Mean held-out score over the folds (shown as the cell output).
np.mean(scores)

#### With bag of words

In [None]:
# 4-fold shuffled cross-validation of multinomial Naive Bayes on x_train_bow.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = MultinomialNB().fit(x_train_bow[train_index], y_train[train_index])
    scores += [classifier.score(x_train_bow[test_index], y_train[test_index])]

# Mean held-out score over the folds (shown as the cell output).
np.mean(scores)

### SVM

#### Without bag of words

In [None]:
from sklearn.svm import SVC

# 4-fold shuffled cross-validation of an RBF-kernel SVM on x_train.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = SVC(kernel='rbf').fit(x_train[train_index], y_train[train_index])
    scores += [classifier.score(x_train[test_index], y_train[test_index])]

# Mean held-out score over the folds (shown as the cell output).
np.mean(scores)

#### With bag of words

In [None]:
# 4-fold shuffled cross-validation of an RBF-kernel SVM on x_train_bow.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = SVC(kernel='rbf').fit(x_train_bow[train_index], y_train[train_index])
    scores += [classifier.score(x_train_bow[test_index], y_train[test_index])]

# Mean held-out score over the folds (shown as the cell output).
np.mean(scores)

### Random forest

#### Without bag of words

In [None]:
from time import time

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# 4-fold shuffled cross-validation of a 50-tree random forest on x_train,
# timed with time(), which was previously called without being imported.
tmp = time()

kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    # Classifier rather than regressor: the labels are classes, and
    # RandomForestRegressor.score would report R^2 instead of the accuracy
    # reported by the other classifiers in this comparison.
    classifier = RandomForestClassifier(n_estimators=50)
    classifier.fit(x_train[train_index], y_train[train_index])
    scores.append(classifier.score(x_train[test_index], y_train[test_index]))

# (mean held-out accuracy, elapsed seconds)
np.mean(np.asarray(scores)), time()-tmp

#### With bag of words

In [None]:
# Imported here so this cell runs on its own; time() was previously called
# without ever being imported.
from time import time

from sklearn.ensemble import RandomForestClassifier

# 4-fold shuffled cross-validation of a 50-tree random forest on x_train_bow.
tmp = time()

kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    # Classifier rather than regressor: the labels are classes, and
    # RandomForestRegressor.score would report R^2 instead of the accuracy
    # reported by the other classifiers in this comparison.
    classifier = RandomForestClassifier(n_estimators=50)
    classifier.fit(x_train_bow[train_index], y_train[train_index])
    scores.append(classifier.score(x_train_bow[test_index], y_train[test_index]))

# (mean held-out accuracy, elapsed seconds)
np.mean(np.asarray(scores)), time()-tmp

### Boosted Trees

#### Without bag of words

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 4-fold shuffled cross-validation of AdaBoost (50 estimators) on x_train.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = AdaBoostClassifier(learning_rate=1., n_estimators=50)
    classifier = classifier.fit(x_train[train_index], y_train[train_index])
    scores += [classifier.score(x_train[test_index], y_train[test_index])]

# Mean held-out score over the folds (shown as the cell output).
np.mean(scores)

#### With bag of words

In [None]:
# 4-fold shuffled cross-validation of AdaBoost (50 estimators) on x_train_bow.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = AdaBoostClassifier(learning_rate=1., n_estimators=50)
    classifier = classifier.fit(x_train_bow[train_index], y_train[train_index])
    scores += [classifier.score(x_train_bow[test_index], y_train[test_index])]

# Mean held-out score over the folds (shown as the cell output).
np.mean(scores)

### Fully connected network

#### Data preprocessing

In [None]:
# Converting to one_hot
def one_hot(x):
    """One-hot encode sentiment labels over the fixed classes (-1, 0, 1).

    Args:
        x: array-like of labels, of any shape.

    Returns:
        A tuple ``(classes, array)``: ``classes`` is ``array([-1, 0, 1])`` and
        ``array`` is an integer array of shape ``x.shape + (3,)`` in which
        ``array[..., i]`` is 1 where ``x == classes[i]`` and 0 elsewhere.
        Labels outside ``classes`` yield an all-zero row.
    """
    classes = np.asarray([-1, 0, 1])
    labels = np.asarray(x)
    # Compare every label against every class in one broadcast. The builtin
    # int replaces the np.int alias, which NumPy 1.24 removed.
    array = (labels[..., np.newaxis] == classes).astype(int)
    return classes, array

# one_hot returns (class labels, one-hot matrix); unpack both.
classes, y_train_oh = one_hot(y_train)

#### Model creation

In [None]:
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall, AUC
from tensorflow.keras.regularizers import l2

def FCN(x_train, y_train):
    """Build, compile and train a small fully connected softmax classifier.

    Architecture: Dense(32, elu) -> Dropout(0.4) -> Dense(16, elu) ->
    Dropout(0.4) -> Dense(n_classes, softmax).

    Args:
        x_train: 2-D feature array; its second dimension sets the input width.
        y_train: 2-D one-hot label array; its second dimension sets the
            number of output units.

    Returns:
        The trained Keras model. 30% of the data passed in is held out for
        validation, and early stopping restores the weights with the best
        validation loss.
    """
    model = Sequential()
    # input_shape must be a tuple: `(n)` is just the int n, `(n,)` is a 1-tuple.
    model.add(InputLayer(input_shape=(x_train.shape[1],)))
    model.add(Dense(units=32,
                    activation='elu',
                    kernel_initializer='lecun_normal'))
    model.add(Dropout(rate=0.4))

    model.add(Dense(units=16,
                    activation='elu',
                    kernel_initializer='lecun_normal'))
    model.add(Dropout(rate=0.4))

    model.add(Dense(units=y_train.shape[1],
                    activation='softmax',
                    kernel_initializer='lecun_normal'))

    METRICS = [CategoricalAccuracy(name='accuracy'),
               Precision(name='precision'),
               Recall(name='recall'),
               AUC(name='auc')]

    # `learning_rate` is the supported spelling; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=1e-5)
    model.compile(optimizer=optimizer,
                 loss='categorical_crossentropy',
                 metrics=METRICS)
    model.summary()

    # The epoch count is only an upper bound: training ends once val_loss has
    # not improved for 15 epochs; the learning rate is multiplied by sqrt(0.1)
    # after 5 epochs without improvement.
    earlystop = EarlyStopping(monitor='val_loss',
                             patience=15,
                             restore_best_weights=True)
    reduceLR = ReduceLROnPlateau(monitor='val_loss',
                                factor=np.sqrt(1e-1),
                                verbose=1,
                                patience=5)
    # `workers` is not passed: it only applies to generator / Sequence input
    # and newer Keras versions reject it.
    model.fit(x=x_train,
              y=y_train,
              batch_size=32,
              epochs=10000,
              verbose=0,
              callbacks=[earlystop, reduceLR],
              validation_split=0.3,
              shuffle=True)

    return model

#### Without bag of words

In [None]:
# 4-fold shuffled cross-validation of the fully connected network on x_train.
kf = KFold(n_splits=4, shuffle=True)
scores = []
# Split the matrix that is actually trained on; this previously split
# x_train_bow, which only worked because both have the same number of rows.
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    model = FCN(x_train[train_index], y_train_oh[train_index])
    scores.append(model.evaluate(x_train[test_index], y_train_oh[test_index]))

# Per-fold rows are [loss, accuracy, precision, recall, auc]; average over folds.
print(np.mean(np.asarray(scores), axis=0))

#### With bag of words

In [None]:
# 4-fold shuffled cross-validation of the fully connected network on x_train_bow.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    model = FCN(x_train_bow[train_index], y_train_oh[train_index])
    scores += [model.evaluate(x_train_bow[test_index], y_train_oh[test_index])]

# Average the loss and each compiled metric over the folds.
print(np.asarray(scores).mean(axis=0))

### LSTM network

#### Data preprocessing

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_features = 2048 # around number of unigrams in the data

# Word-index vocabulary fitted on the airlines tweets.
tokenizer = Tokenizer(split=' ', num_words=max_features)
tokenizer.fit_on_texts(df_airlines.text.values)

# Integer sequences, padded to a common length.
x_train_seq = pad_sequences(tokenizer.texts_to_sequences(df_airlines.text.values))

print(x_train_seq.shape)

#### Model creation

In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, SpatialDropout1D


def RNN(x_train, y_train):
    """Build, compile and train a bidirectional LSTM softmax classifier.

    Architecture: Embedding(max_features, 256) -> SpatialDropout1D(0.4) ->
    Bidirectional(LSTM(64)) -> Dense(n_classes, softmax). The vocabulary size
    is read from the module-level ``max_features``.

    Args:
        x_train: 2-D array of padded integer token sequences.
        y_train: 2-D one-hot label array; its second dimension sets the
            number of output units.

    Returns:
        The trained Keras model. 30% of the data passed in is held out for
        validation, and early stopping restores the weights with the best
        validation loss.
    """
    model = Sequential()
    model.add(Embedding(max_features, 256, embeddings_initializer='lecun_normal'))
    model.add(SpatialDropout1D(rate=0.4))
    model.add(Bidirectional(LSTM(units=64, dropout=0.4, recurrent_dropout=0.4)))

    model.add(Dense(units=y_train.shape[1],
                    activation='softmax',
                    kernel_initializer='lecun_normal'))

    METRICS = [CategoricalAccuracy(name='accuracy'),
               Precision(name='precision'),
               Recall(name='recall'),
               AUC(name='auc')]

    # `learning_rate` is the supported spelling; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=1e-5)
    model.compile(optimizer=optimizer,
                 loss='categorical_crossentropy',
                 metrics=METRICS)
    model.summary()

    # The epoch count is only an upper bound: training ends once val_loss has
    # not improved for 15 epochs; the learning rate is multiplied by sqrt(0.1)
    # after 5 epochs without improvement.
    earlystop = EarlyStopping(monitor='val_loss',
                             patience=15,
                             restore_best_weights=True)
    reduceLR = ReduceLROnPlateau(monitor='val_loss',
                                factor=np.sqrt(1e-1),
                                verbose=1,
                                patience=5)
    # `workers` is not passed: it only applies to generator / Sequence input
    # and newer Keras versions reject it.
    model.fit(x=x_train,
              y=y_train,
              batch_size=32,
              epochs=10000,
              verbose=2,
              callbacks=[earlystop, reduceLR],
              validation_split=0.3,
              shuffle=True)

    return model

#### Macro parameters exploration

In [None]:
# 4-fold shuffled cross-validation of the LSTM network on the padded sequences.
kf = KFold(shuffle=True, n_splits=4)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_seq), total=4):
    model = RNN(x_train_seq[train_index], y_train_oh[train_index])
    scores += [model.evaluate(x_train_seq[test_index], y_train_oh[test_index])]

# One entry: the loss and compiled metrics averaged over the folds.
all_scores = [np.asarray(scores).mean(axis=0)]

print(all_scores)

## End of comparison of models

## Evaluation on test datasets

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
def evaluate(y_true, y_pred):
    """Score hard class predictions against the true labels.

    Works for binary and multiclass labels. Precision and recall are
    macro-averaged over the classes, because the default ``average='binary'``
    raises on multiclass targets. ROC AUC is computed one-vs-rest from the
    hard predictions for every class present in ``y_true`` and then averaged,
    because ``roc_auc_score`` rejects multiclass hard labels.

    Args:
        y_true: 1-D array-like of true class labels.
        y_pred: 1-D array-like of predicted class labels.

    Returns:
        ``[accuracy, precision, recall, roc_auc]``. ``roc_auc`` is NaN when
        ``y_true`` contains fewer than two distinct classes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Only classes that occur in y_true: a one-vs-rest AUC is undefined for a
    # class with no positive sample.
    present = np.unique(y_true)
    if len(present) > 1:
        auc = float(np.mean([roc_auc_score(y_true == label, y_pred == label)
                             for label in present]))
    else:
        auc = float('nan')

    return [accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='macro'),
            recall_score(y_true, y_pred, average='macro'),
            auc]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebuild every feature representation from scratch. The vectorizers and the
# tokenizer are fitted on the airlines dataset only, and the two other
# datasets are merely transformed with them.
vectorizer = TfidfVectorizer(min_df=10,
                             norm='l2',
                             ngram_range=(1,1))
vectorizer_bow = TfidfVectorizer(min_df=10,
                                 norm='l2',
                                 ngram_range=(1,3))

max_features = 2048 # around number of unigrams in the data
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df_airlines.text.values)

# Airlines: training / cross-validation data.
x_train = vectorizer.fit_transform(df_airlines.text.values).toarray()
x_train_bow = vectorizer_bow.fit_transform(df_airlines.text.values).toarray()
x_train_seq = pad_sequences(tokenizer.texts_to_sequences(df_airlines.text.values))
y_train = df_airlines.classification.values
# one_hot returns (classes, array); only the array is usable as model targets.
classes, y_train_oh = one_hot(y_train)

# Sentiment: external test data. The padded sequences are model inputs, hence
# x_sentiment_seq, the name the evaluation loops read.
x_sentiment = vectorizer.transform(df_sentiment.text.values).toarray()
x_sentiment_bow = vectorizer_bow.transform(df_sentiment.text.values).toarray()
x_sentiment_seq = pad_sequences(tokenizer.texts_to_sequences(df_sentiment.text.values))
y_sentiment = df_sentiment.classification.values
_, y_sentiment_oh = one_hot(y_sentiment)

# Amazon reviews: external test data.
x_amazon = vectorizer.transform(df_amazon.text.values).toarray()
x_amazon_bow = vectorizer_bow.transform(df_amazon.text.values).toarray()
x_amazon_seq = pad_sequences(tokenizer.texts_to_sequences(df_amazon.text.values))
y_amazon = df_amazon.classification.values
_, y_amazon_oh = one_hot(y_amazon)

In [None]:
# Cross-dataset evaluation. Every model is trained on the training part of
# each airlines fold, then scored on the whole sentiment dataset, the whole
# amazon dataset and the held-out airlines fold.
# One accumulator per (dataset, model) pair; each entry is the metric list of
# one fold.
amazon_nb = []
amazon_svm = []
amazon_adaboost = []
amazon_ann = []
amazon_rnn = []

sentiment_nb = []
sentiment_svm = []
sentiment_adaboost = []
sentiment_ann = []
sentiment_rnn = []

airlines_nb = []
airlines_svm = []
airlines_adaboost = []
airlines_ann = []
airlines_rnn = []

# NOTE(review): `kf` is not created in this cell; it is whatever KFold object
# the last executed cross-validation cell left behind.
for k in range(5): # repeat the 4-fold split 5 times so the results can be averaged
    for train_index, test_index in tqdm(kf.split(x_train), total=4):
        # Naive Bayes
        classifier = MultinomialNB()
        classifier.fit(x_train[train_index], y_train[train_index])

        sentiment_nb.append(evaluate(y_sentiment, classifier.predict(x_sentiment)))
        amazon_nb.append(evaluate(y_amazon, classifier.predict(x_amazon)))
        airlines_nb.append(evaluate(y_train[test_index], classifier.predict(x_train[test_index])))
        # SVM
        classifier = SVC(kernel='rbf')
        classifier.fit(x_train[train_index], y_train[train_index])

        sentiment_svm.append(evaluate(y_sentiment, classifier.predict(x_sentiment)))
        amazon_svm.append(evaluate(y_amazon, classifier.predict(x_amazon)))
        airlines_svm.append(evaluate(y_train[test_index], classifier.predict(x_train[test_index])))
        # AdaBoost
        classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.)
        classifier.fit(x_train[train_index], y_train[train_index])

        sentiment_adaboost.append(evaluate(y_sentiment, classifier.predict(x_sentiment)))
        amazon_adaboost.append(evaluate(y_amazon, classifier.predict(x_amazon)))
        airlines_adaboost.append(evaluate(y_train[test_index], classifier.predict(x_train[test_index])))
        # ANN -- model.evaluate returns the loss first; [1:] keeps only the metrics
        model = FCN(x_train[train_index], y_train_oh[train_index])
        sentiment_ann.append(model.evaluate(x_sentiment, y_sentiment_oh)[1:])
        amazon_ann.append(model.evaluate(x_amazon, y_amazon_oh)[1:])
        airlines_ann.append(model.evaluate(x_train[test_index], y_train_oh[test_index])[1:])
        # RNN -- trained on the padded token sequences instead of TF-IDF features
        model = RNN(x_train_seq[train_index], y_train_oh[train_index])
        sentiment_rnn.append(model.evaluate(x_sentiment_seq, y_sentiment_oh)[1:])
        amazon_rnn.append(model.evaluate(x_amazon_seq, y_amazon_oh)[1:])
        airlines_rnn.append(model.evaluate(x_train_seq[test_index], y_train_oh[test_index])[1:])

# One CSV per dataset: a column per model, a row per fold (5 repetitions x 4
# folds), each cell holding that fold's list of metrics.
amazon = pd.DataFrame({'NB':amazon_nb, 'SVM':amazon_svm, 'AdaBoost':amazon_adaboost, 'ANN':amazon_ann, 'RNN':amazon_rnn})
amazon.to_csv('amazon_scores.csv')

sentiment = pd.DataFrame({'NB':sentiment_nb, 'SVM':sentiment_svm, 'AdaBoost':sentiment_adaboost, 'ANN':sentiment_ann, 'RNN':sentiment_rnn})
sentiment.to_csv('sentiment_scores.csv')

airlines = pd.DataFrame({'NB':airlines_nb, 'SVM':airlines_svm, 'AdaBoost':airlines_adaboost, 'ANN':airlines_ann, 'RNN':airlines_rnn})
airlines.to_csv('airlines_scores.csv')

In [None]:
# Same cross-dataset evaluation as the previous cell, run on the 1- to 3-gram
# TF-IDF features. The accumulators are reset first.
amazon_nb = []
amazon_svm = []
amazon_adaboost = []
amazon_ann = []
amazon_rnn = []

sentiment_nb = []
sentiment_svm = []
sentiment_adaboost = []
sentiment_ann = []
sentiment_rnn = []

airlines_nb = []
airlines_svm = []
airlines_adaboost = []
airlines_ann = []
airlines_rnn = []

# Rebind the feature matrices to their n-gram versions so the loop body can
# stay identical to the unigram run.
# NOTE(review): this overwrites the unigram x_train / x_amazon / x_sentiment
# for the rest of the session; earlier cells re-run after this one will
# silently use the n-gram features.
x_train = x_train_bow
x_amazon = x_amazon_bow
x_sentiment = x_sentiment_bow

# NOTE(review): `kf` is not created in this cell; it is whatever KFold object
# the last executed cross-validation cell left behind.
for k in range(5): # repeat the 4-fold split 5 times so the results can be averaged
    for train_index, test_index in tqdm(kf.split(x_train), total=4):
        # Naive Bayes
        classifier = MultinomialNB()
        classifier.fit(x_train[train_index], y_train[train_index])

        sentiment_nb.append(evaluate(y_sentiment, classifier.predict(x_sentiment)))
        amazon_nb.append(evaluate(y_amazon, classifier.predict(x_amazon)))
        airlines_nb.append(evaluate(y_train[test_index], classifier.predict(x_train[test_index])))
        # SVM
        classifier = SVC(kernel='rbf')
        classifier.fit(x_train[train_index], y_train[train_index])

        sentiment_svm.append(evaluate(y_sentiment, classifier.predict(x_sentiment)))
        amazon_svm.append(evaluate(y_amazon, classifier.predict(x_amazon)))
        airlines_svm.append(evaluate(y_train[test_index], classifier.predict(x_train[test_index])))
        # AdaBoost
        classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.)
        classifier.fit(x_train[train_index], y_train[train_index])

        sentiment_adaboost.append(evaluate(y_sentiment, classifier.predict(x_sentiment)))
        amazon_adaboost.append(evaluate(y_amazon, classifier.predict(x_amazon)))
        airlines_adaboost.append(evaluate(y_train[test_index], classifier.predict(x_train[test_index])))
        # ANN -- model.evaluate returns the loss first; [1:] keeps only the metrics
        model = FCN(x_train[train_index], y_train_oh[train_index])
        sentiment_ann.append(model.evaluate(x_sentiment, y_sentiment_oh)[1:])
        amazon_ann.append(model.evaluate(x_amazon, y_amazon_oh)[1:])
        airlines_ann.append(model.evaluate(x_train[test_index], y_train_oh[test_index])[1:])
        # RNN -- uses the token sequences, so it does not depend on the n-gram features
        model = RNN(x_train_seq[train_index], y_train_oh[train_index])
        sentiment_rnn.append(model.evaluate(x_sentiment_seq, y_sentiment_oh)[1:])
        amazon_rnn.append(model.evaluate(x_amazon_seq, y_amazon_oh)[1:])
        airlines_rnn.append(model.evaluate(x_train_seq[test_index], y_train_oh[test_index])[1:])

# One CSV per dataset: a column per model, a row per fold (5 repetitions x 4
# folds), each cell holding that fold's list of metrics.
amazon = pd.DataFrame({'NB':amazon_nb, 'SVM':amazon_svm, 'AdaBoost':amazon_adaboost, 'ANN':amazon_ann, 'RNN':amazon_rnn})
amazon.to_csv('amazon_scores_bow.csv')

sentiment = pd.DataFrame({'NB':sentiment_nb, 'SVM':sentiment_svm, 'AdaBoost':sentiment_adaboost, 'ANN':sentiment_ann, 'RNN':sentiment_rnn})
sentiment.to_csv('sentiment_scores_bow.csv')

airlines = pd.DataFrame({'NB':airlines_nb, 'SVM':airlines_svm, 'AdaBoost':airlines_adaboost, 'ANN':airlines_ann, 'RNN':airlines_rnn})
airlines.to_csv('airlines_scores_bow.csv')