# Project Notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import nltk
import tensorflow as tf
import gzip
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/yann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yann/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/yann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yann/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Machine learning librairies and tools
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall, AUC

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

## Data exploration

### Cleaning data algorithm

In [None]:
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

def clean_single(tweet, stop_words = stopwords.words('english'), numbers=True):

    cleaned_tokens = []
    # Removing links
    tweet = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', tweet)
    # Removing @
    tweet = re.sub("(@[A-Za-z0-9_]+)","", tweet)
        
    # Removing currencies
    tweet = re.sub(r'\$\w*', '', tweet)
    
    tokens = word_tokenize(tweet)
    
    for token, tag in pos_tag(tokens):
                
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
            
            # remove tokens containing numbers
    if numbers:
        numbers = [str(i) for i in range(10)]
        kill_list = []
        for number in numbers:
            kill_list = kill_list + [w for w in cleaned_tokens if number in w]
        # removing selected tokens
        cleaned_tokens = [w for w in cleaned_tokens if not w in kill_list]
        
        # merge tokens
        merged = ' '
        merged = merged.join(cleaned_tokens)
        
    return merged

def clean(array):
    for i, phrase in enumerate(array):
        array[i] = clean_single(phrase)
    return array

### Opening dataset files, cleaning and saving

In [None]:
# df_raw_sentiment = pd.read_csv('data/training.1600000.processed.noemoticon.csv', 
#                           engine='python', 
#                           header=None, 
#                           names=['score', 'tweet_id', 'date', '?', 'account', 'text'])
# df_sentiment = df_raw_sentiment[['score', 'text']]


# classification = np.ones(len(df_sentiment), dtype=np.int)
# classification[df_sentiment['score']==0] = -1

# df_sentiment.insert(loc=2, column='classification', value=classification)
# df_sentiment = df_sentiment[['classification', 'text']]

# sentiment_classification = df_sentiment.classification.values
# sentiment_text = clean(df_sentiment.text.values)

# print(sentiment_classification.shape, sentiment_text.shape)

# temp = pd.DataFrame({'text':sentiment_text,'classification':sentiment_classification})
# temp.to_csv('data/sentiment_cleaned.csv', index=False)

# df_raw_airlines = pd.read_csv('data/Tweets.csv', engine='python')
# df_airlines = df_raw_airlines[['airline_sentiment', 'text']]

# classification = np.zeros(len(df_airlines), dtype=np.int)
# classification[df_airlines['airline_sentiment']=='negative'] = -1
# classification[df_airlines['airline_sentiment']=='neutral'] = 0
# classification[df_airlines['airline_sentiment']=='positive'] = 1

# df_airlines.insert(loc=2, column='classification', value=classification)
# df_airlines = df_airlines[['classification', 'text']]

# airlines_classification = df_airlines.classification.values
# airlines_text = clean(df_airlines.text.values)

# print(airlines_classification.shape, airlines_text.shape)

# temp = pd.DataFrame({'text':airlines_text,'classification':airlines_classification})
# temp.to_csv('data/airlines_cleaned.csv', index=False)

In [None]:
# def parse(path):
#     g = gzip.open(path, 'r')
#     for l in g:
#         yield eval(l)

# reviews = []
# scores = []
# for review in parse('data/reviews_Movies_and_TV_5.json.gz'):
#     reviews.append(clean_single(review['reviewText']))
#     scores.append(review['overall'])
#     if len(scores) in [k*100000 for k in range(1,17)]:
#         print(len(scores))
# scores=np.asarray(scores).astype(dtype=np.int)
# classification = np.zeros(scores.shape, dtype=np.int)
# classification[scores<=2] = -1
# classification[scores>=4] = 1
# temp = pd.DataFrame({'text':reviews, 'classification':classification})
# temp.to_csv('data/amazon_movies.csv')

#### Opening cleaned data

In [3]:
df_airlines = pd.read_csv('data/airlines_cleaned.csv').dropna()
df_airlines.head()

Unnamed: 0,text,classification
0,say,0
1,plus 've add commercial experience ... tacky,1
2,n't today ... must mean need take another trip,0
3,'s really aggressive blast obnoxious `` entert...,-1
4,'s really big bad thing,-1


In [4]:
df_sentiment = pd.read_csv('data/sentiment_cleaned.csv').dropna()
df_sentiment.head()

Unnamed: 0,text,classification
0,awww 's bummer shoulda get david carr third day,-1
1,upset ca n't update facebook texting ... might...,-1
2,dive many time ball managed save rest go bound,-1
3,whole body feel itchy like fire,-1
4,'s behave 'm mad ca n't see,-1


In [5]:
df_amazon = pd.read_csv('data/amazon_movies.csv').dropna()
df_amazon.head()

Unnamed: 0.1,Unnamed: 0,text,classification
0,0,charming version classic dicken 's tale henry ...,1
1,1,good emotionally move christmas carol dickens ...,0
2,2,n't get wrong winkler wonderful character acto...,0
3,3,henry winkler good twist classic story convent...,1
4,4,one best scrooge movie henry winkler outdo cas...,1


### Visualisation

#### Sentiment repartition over the datasets

In [None]:
plt.figure(figsize=(14,5))
plt.subplot(1,3,1)
labels = ['positive', 'neutral', 'negative']
sizes = [np.sum(df_airlines.classification==1), np.sum(df_airlines.classification==0), np.sum(df_airlines.classification==-1)]
colors = ['green', 'yellow', 'red']
patches, texts = plt.pie(sizes, colors=colors, shadow=True, startangle=140)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.tight_layout()
plt.title('Sentiment distribution on airlines dataset')

plt.subplot(1,3,2)
labels = ['positive', 'negative']
sizes = [np.sum(df_sentiment.classification==1), np.sum(df_sentiment.classification==-1)]
colors = ['green', 'red']
patches, texts = plt.pie(sizes, colors=colors, shadow=True, startangle=90)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.tight_layout()
plt.title('Sentiment distribution on sentiment dataset')

plt.subplot(1,3,3)
labels = ['positive', 'neutral', 'negative']
sizes = [np.sum(df_amazon.classification==1), np.sum(df_amazon.classification==0), np.sum(df_amazon.classification==-1)]
colors = ['green', 'yellow', 'red']
patches, texts = plt.pie(sizes, colors=colors, shadow=True, startangle=140)
plt.legend(patches, labels, loc="best")
plt.axis('equal')
plt.tight_layout()
plt.title('Sentiment distribution on the dataset of amazon reviews')

plt.savefig('figures/pie_charts_sentiment_distribution.png')
plt.show()

#### Wordclouds for the airlines dataset

In [None]:
pos_text = " ".join(df_airlines[df_airlines.classification==1].text.values)
neg_text = " ".join(df_airlines[df_airlines.classification==-1].text.values)
neu_text = " ".join(df_airlines[df_airlines.classification==0].text.values)

wordcloud_pos = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(pos_text)
wordcloud_neg = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(neg_text)
wordcloud_neu = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(neu_text)

plt.figure(figsize=(20, 20))
plt.subplot(1,3,1)
plt.imshow(wordcloud_pos, interpolation="bilinear")
plt.axis("off")
plt.title('Positive reviews')
plt.subplot(1,3,2)
plt.imshow(wordcloud_neu, interpolation="bilinear")
plt.axis("off")
plt.title('Neutral reviews')
plt.subplot(1,3,3)
plt.imshow(wordcloud_neg, interpolation="bilinear")
plt.axis("off")
plt.title('Negative reviews')
plt.savefig('figures/airline_wordclouds.png')
plt.show()

#### Wordclouds for the sentiment dataset

In [None]:
pos_text = " ".join(df_sentiment[df_sentiment.classification==1].text.values)
neg_text = " ".join(df_sentiment[df_sentiment.classification==-1].text.values)

wordcloud_pos = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(pos_text)
wordcloud_neg = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(neg_text)

plt.figure(figsize=(20, 20))
plt.subplot(1,2,1)
plt.imshow(wordcloud_pos, interpolation="bilinear")
plt.axis("off")
plt.title('Positive reviews')
plt.subplot(1,2,2)
plt.imshow(wordcloud_neg, interpolation="bilinear")
plt.axis("off")
plt.title('Neutral reviews')
plt.savefig('figures/sentiment_wordclouds.png')
plt.show()

#### Wordclouds for the amazon reviews dataset

In [None]:
pos_text = " ".join(df_amazon[df_amazon.classification==1].text.values)
neg_text = " ".join(df_amazon[df_amazon.classification==-1].text.values)
neu_text = " ".join(df_amazon[df_amazon.classification==0].text.values)

wordcloud_pos = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(pos_text)
wordcloud_neg = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(neg_text)
wordcloud_neu = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(neu_text)

plt.figure(figsize=(20, 20))
plt.subplot(1,3,1)
plt.imshow(wordcloud_pos, interpolation="bilinear")
plt.axis("off")
plt.title('Positive reviews')
plt.subplot(1,3,2)
plt.imshow(wordcloud_neu, interpolation="bilinear")
plt.axis("off")
plt.title('Neutral reviews')
plt.subplot(1,3,3)
plt.imshow(wordcloud_neg, interpolation="bilinear")
plt.axis("off")
plt.title('Negative reviews')
plt.savefig('figures/amazon_wordclouds.png')
plt.show()

## Feature extraction

###  TF-IDF Vectorizer without bag of words

In [6]:
vectorizer = TfidfVectorizer(min_df=10,
                             norm='l2',
                             ngram_range=(1,1))

x_train = vectorizer.fit_transform(df_airlines.text.values).toarray()
y_train = df_airlines.classification.values
x_train.shape

(14604, 1522)

### TF-IDF Vectorizer with bag of words

In [7]:
vectorizer_bow = TfidfVectorizer(min_df=10,
                                 norm='l2',
                                 ngram_range=(1,3))

x_train_bow = vectorizer_bow.fit_transform(df_airlines.text.values).toarray()
x_train_bow.shape

(14604, 2290)

## Classification

### Naive Bayes

#### Without bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = MultinomialNB()
    classifier.fit(x_train[train_index], y_train[train_index])
    scores.append(classifier.score(x_train[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

#### With bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = MultinomialNB()
    classifier.fit(x_train_bow[train_index], y_train[train_index])
    scores.append(classifier.score(x_train_bow[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

### SVM

#### Without bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = SVC(kernel='rbf')
    classifier.fit(x_train[train_index], y_train[train_index])
    scores.append(classifier.score(x_train[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

#### With bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = SVC(kernel='rbf')
    classifier.fit(x_train_bow[train_index], y_train[train_index])
    scores.append(classifier.score(x_train_bow[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

### Random forest

#### Without bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = RandomForestRegressor(n_estimators=50)
    classifier.fit(x_train[train_index], y_train[train_index])
    scores.append(classifier.score(x_train[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

#### With bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = RandomForestRegressor(n_estimators=50)
    classifier.fit(x_train_bow[train_index], y_train[train_index])
    scores.append(classifier.score(x_train_bow[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

### Boosted Trees

#### Without bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train), total=4):
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.)
    classifier.fit(x_train[train_index], y_train[train_index])
    scores.append(classifier.score(x_train[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

#### With bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.)
    classifier.fit(x_train_bow[train_index], y_train[train_index])
    scores.append(classifier.score(x_train_bow[test_index], y_train[test_index]))
    
np.mean(np.asarray(scores))

### Fully connected network

#### Data preprocessing

In [8]:
# Converting to one_hot
def one_hot(x):
    classes = np.asarray([-1, 0, 1])
    array = np.zeros((*x.shape, classes.shape[0]), dtype=np.int)
    for i, classe in enumerate(classes):
        vector = np.zeros((1,classes.shape[0]), dtype=np.int)
        vector[:,i]=1
        array[x==classe] = vector
    return array

y_train_oh = one_hot(y_train)

#### Model creation

In [15]:
def FCN(x_train, y_train):
    model = Sequential()
    model.add(InputLayer(input_shape=(x_train.shape[1])))
    model.add(Dense(units=32,
                    activation='elu',
                    kernel_initializer='lecun_normal'))
    model.add(Dropout(rate=0.4))
    
    model.add(Dense(units=16,
                    activation='elu',
                    kernel_initializer='lecun_normal'))
    model.add(Dropout(rate=0.4))
    
    model.add(Dense(units=y_train.shape[1], 
                    activation='softmax', 
                    kernel_initializer='lecun_normal'))
    
    METRICS = [CategoricalAccuracy(name='accuracy'),
               Precision(name='precision'),
               Recall(name='recall'),
               AUC(name='auc')]

    optimizer = Adam(lr=1e-5)
    model.compile(optimizer=optimizer,
                 loss='categorical_crossentropy',
                 metrics=METRICS)
    model.summary()
    
    earlystop = EarlyStopping(monitor='val_loss',
                             patience=15,
                             restore_best_weights=True)
    reduceLR = ReduceLROnPlateau(monitor='val_loss',
                                factor=np.sqrt(1e-1),
                                verbose=1,
                                patience=5)
    history = model.fit(x=x_train,
                        y=y_train,
                        batch_size=32,
                        epochs=10000,
                        verbose=0,
                        callbacks=[earlystop, reduceLR],
                        validation_split=0.3,
                        shuffle=True,
                        workers=4)
    
    return model

#### Without bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    model = FCN(x_train[train_index], y_train_oh[train_index])
    scores.append(model.evaluate(x_train[test_index], y_train_oh[test_index]))
                
print(np.mean(np.asarray(scores), axis=0))

#### With bag of words

In [None]:
kf = KFold(n_splits=4, shuffle=True)
scores = []
for train_index, test_index in tqdm(kf.split(x_train_bow), total=4):
    model = FCN(x_train_bow[train_index], y_train_oh[train_index])
    scores.append(model.evaluate(x_train_bow[test_index], y_train_oh[test_index]))
    
print(np.mean(np.asarray(scores), axis=0))

### LSTM network

#### Data preprocessing

In [10]:
max_features = 2048 # around number of unigrams in the data
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df_airlines.text.values)

x_train_seq = tokenizer.texts_to_sequences(df_airlines.text.values)
x_train_seq = pad_sequences(x_train_seq)

print(x_train_seq.shape)

(14604, 21)


#### Model creation

In [14]:
def RNN(x_train, y_train):
    model = Sequential()
    model.add(Embedding(max_features, 256, embeddings_initializer='lecun_normal'))
    model.add(SpatialDropout1D(rate=0.4))
    model.add(Bidirectional(LSTM(units=64, dropout=0.4, recurrent_dropout=0.4)))
    
    model.add(Dense(units=y_train.shape[1], 
                    activation='softmax', 
                    kernel_initializer='lecun_normal'))
    
    METRICS = [CategoricalAccuracy(name='accuracy'),
               Precision(name='precision'),
               Recall(name='recall'),
               AUC(name='auc')]

    optimizer = Adam(lr=1e-5)
    model.compile(optimizer=optimizer,
                 loss='categorical_crossentropy',
                 metrics=METRICS)
    model.summary()
    
    earlystop = EarlyStopping(monitor='val_loss',
                             patience=15,
                             restore_best_weights=True)
    reduceLR = ReduceLROnPlateau(monitor='val_loss',
                                factor=np.sqrt(1e-1),
                                verbose=1,
                                patience=5)
    history = model.fit(x=x_train,
                        y=y_train,
                        batch_size=32,
                        epochs=10000,
                        verbose=0,
                        callbacks=[earlystop, reduceLR],
                        validation_split=0.3,
                        shuffle=True,
                        workers=4)
    
    return model

#### Macro parameters exploration

In [None]:
kf = KFold(n_splits=4, shuffle=True) 
all_scores = []
scores = []
for train_index, test_index in tqdm(kf.split(x_train_seq), total=4):
    model = RNN(x_train_seq[train_index], y_train_oh[train_index])
    scores.append(model.evaluate(x_train_seq[test_index], y_train_oh[test_index]))

all_scores.append(np.mean(np.asarray(scores), axis=0))
        
print(all_scores)

## End of comparison of models

## Evaluation on test datasets

In [46]:
def adapt(y):
    array = np.zeros(y.shape, dtype=np.int)
    for i, row in enumerate(y):
        array[i][np.argmax(row)]=1
    return array

def evaluate(dataframe, classifier, preprocessing_x, preprocessing_y=None, NN=False):
    size = len(dataframe)
    scores = []
    n = size//100000
    for k in range(n):
        y_true = dataframe.classification.values[k*100000:(k+1)*100000]
        if preprocessing_y:
            y_true = preprocessing_y(y_true)
            
        if NN:
            scores.append(model.evaluate(preprocessing_x(dataframe.text.values[k*100000:(k+1)*100000]), y_true, verbose=0)[1:])
        
        else:
            y_pred = classifier.predict(preprocessing_x(dataframe.text.values[k*100000:(k+1)*100000]))
            scores.append([accuracy_score(y_true, y_pred), 
                           precision_score(y_true, y_pred, average='weighted'),
                           recall_score(y_true, y_pred, average='weighted', zero_division=0)])
        
    
    
    # last values
    y_true = dataframe.classification.values[(k+1)*100000:]
    if preprocessing_y:
        y_true = preprocessing_y(y_true)
    
    if NN:
        scores.append(model.evaluate(preprocessing_x(dataframe.text.values[(k+1)*100000:]), y_true, verbose=0)[1:])
        
    else:
        y_pred = classifier.predict(preprocessing_x(dataframe.text.values[(k+1)*100000:]))
        scores.append([accuracy_score(y_true, y_pred), 
                       precision_score(y_true, y_pred, average='weighted'),
                       recall_score(y_true, y_pred, average='weighted', zero_division=0)])
        
    return np.mean(np.asarray(scores), axis=0)

def get_scores(y_true, y_pred):
    return [accuracy_score(y_true, y_pred), precision_score(y_true, y_pred, average='weighted'),recall_score(y_true, y_pred, average='weighted')]





In [47]:
vectorizer = TfidfVectorizer(min_df=10,
                             norm='l2',
                             ngram_range=(1,1))
vectorizer_bow = TfidfVectorizer(min_df=10,
                                 norm='l2',
                                 ngram_range=(1,3))

max_features = 2048 # around number of unigrams in the data
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df_airlines.text.values)

x_train = vectorizer.fit_transform(df_airlines.text.values).toarray()
x_train_bow = vectorizer_bow.fit_transform(df_airlines.text.values).toarray()
x_train_seq = pad_sequences(tokenizer.texts_to_sequences(df_airlines.text.values))
y_train = df_airlines.classification.values
y_train_oh = one_hot(y_train)

In [None]:
amazon_nb = []
amazon_svm = []
amazon_adaboost = []
amazon_ann = []
amazon_rnn = []

sentiment_nb = []
sentiment_svm = []
sentiment_adaboost = []
sentiment_ann = []
sentiment_rnn = []

airlines_nb = []
airlines_svm = []
airlines_adaboost = []
airlines_ann = []
airlines_rnn = []

kf = KFold(n_splits=4, shuffle=True)
for k in range(5): #needs to average the results
    for train_index, test_index in tqdm(kf.split(x_train), total=4):
        indexes_amazon = np.random.choice(a=len(df_amazon), size=len(df_amazon), replace=False)
        indexes_sentiment = np.random.choice(a=len(df_sentiment), size=len(df_sentiment), replace=False)
        
        print('Naive Bayes')
        # Naive Bayes
        classifier = MultinomialNB()
        classifier.fit(x_train[train_index], y_train[train_index])
        
        print('evaluation')
        amazon_nb.append(evaluate(df_amazon.iloc[indexes_amazon], classifier, lambda x:vectorizer.transform(x).toarray()))
        sentiment_nb.append(evaluate(df_sentiment.iloc[indexes_sentiment], classifier, lambda x:vectorizer.transform(x).toarray()))
        airlines_nb.append(get_scores(y_train[test_index], classifier.predict(x_train[test_index])))
        
#         print('SVM')
#         # SVM
#         classifier = SVC(kernel='rbf')
#         classifier.fit(x_train[train_index], y_train[train_index])
        
#         print('evaluation sentiment')
#         sentiment_svm.append(evaluate(df_sentiment.iloc[indexes_sentiment], classifier, lambda x:vectorizer.transform(x).toarray()))
#         print('evaluation amazon')
#         amazon_svm.append(evaluate(df_amazon.iloc[indexes_amazon], classifier, lambda x:vectorizer.transform(x).toarray()))
#         print('evaluation airlines')
#         airlines_svm.append(get_scores(y_train[test_index], classifier.predict(x_train[test_index])))
        
        print('AdaBoost')
        # AdaBoost
        classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.)
        classifier.fit(x_train[train_index], y_train[train_index])
        
        print('evaluation sentiment')
        sentiment_adaboost.append(evaluate(df_sentiment.iloc[indexes_sentiment], classifier, lambda x:vectorizer.transform(x).toarray()))
        print('evaluation amazon')
        amazon_adaboost.append(evaluate(df_amazon.iloc[indexes_amazon], classifier, lambda x:vectorizer.transform(x).toarray()))
        print('evaluation airlines')
        airlines_adaboost.append(get_scores(y_train[test_index], classifier.predict(x_train[test_index])))
        
        print('ANN')
        # ANN
        model = FCN(x_train[train_index], y_train_oh[train_index])
                
        print('evaluation sentiment')
        sentiment_ann.append(evaluate(df_sentiment.iloc[indexes_sentiment], model, lambda x:vectorizer.transform(x).toarray(), one_hot, NN=True))
        print('evaluation amazon')
        amazon_ann.append(evaluate(df_amazon.iloc[indexes_amazon], model, lambda x:vectorizer.transform(x).toarray(), one_hot, NN=True))
        print('evaluation airlines')
        airlines_ann.append(model.evaluate(x_train[test_index], y_train_oh[test_index])[1:])
        
        print('RNN')
        # RNN
        model = RNN(x_train_seq[train_index], y_train_oh[train_index])
                
        print('evaluation sentiment')
        sentiment_rnn.append(model.evaluate(df_sentiment.iloc[indexes_sentiment], model, lambda x:pad_sequences(tokenizer.texts_to_sequences(x)), one_hot, NN=True))
        print('evaluation amazon')
        amazon_rnn.append(model.evaluate(df_amazon.iloc[indexes_amazon], model, lambda x:pad_sequences(tokenizer.texts_to_sequences(x)), one_hot, NN=True))
        print('evaluation airlines')
        airlines_rnn.append(model.evaluate(x_train_seq[test_index], y_train_oh[test_index])[1:])
            
amazon = pd.DataFrame({'NB':amazon_nb, 'SVM':amazon_svm, 'AdaBoost':amazon_adaboost, 'ANN':amazon_ann, 'RNN':amazon_rnn})
amazon.to_csv('amazon_scores.csv')

sentiment = pd.DataFrame({'NB':sentiment_nb, 'SVM':sentiment_svm, 'AdaBoost':sentiment_adaboost, 'ANN':sentiment_ann, 'RNN':sentiment_rnn})
sentiment.to_csv('sentiment_scores.csv')         

airlines = pd.DataFrame({'NB':airlines_nb, 'SVM':airlines_svm, 'AdaBoost':airlines_adaboost, 'ANN':airlines_ann, 'RNN':airlines_rnn})
airlines.to_csv('airlines_scores.csv')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Naive Bayes
evaluation
AdaBoost
evaluation sentiment
evaluation amazon
evaluation airlines
ANN
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 32)                48736     
_________________________________________________________________
dropout_8 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_9 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 51        
Total params: 49,315
Trainable params: 49,315
Non-trainable params: 0
_________________________________________________________________

Epoch 00199: ReduceL

In [None]:
x_train = x_train_bow

amazon_nb = []
amazon_svm = []
amazon_adaboost = []
amazon_ann = []
amazon_rnn = []

sentiment_nb = []
sentiment_svm = []
sentiment_adaboost = []
sentiment_ann = []
sentiment_rnn = []

airlines_nb = []
airlines_svm = []
airlines_adaboost = []
airlines_ann = []
airlines_rnn = []

kf = KFold(n_splits=4, shuffle=True)
for k in range(5): #needs to average the results
    for train_index, test_index in tqdm(kf.split(x_train), total=4):
        indexes_amazon = np.random.choice(a=len(df_amazon), size=len(df_amazon), replace=False)
        indexes_sentiment = np.random.choice(a=len(df_sentiment), size=len(df_sentiment), replace=False)
        
        # Naive Bayes
        classifier = MultinomialNB()
        classifier.fit(x_train[train_index], y_train[train_index])
            
        sentiment_nb.append(evaluate(df_sentiment[indexes_sentiment], classifier, vectorizer_bow.transform))
        amazon_nb.append(evaluate(df_amazon[indexes_amazon], classifier, vectorizer_bow.transform))
        airlines_nb.append(get_scores(y_train[test_index], classifier.predict(x_train[test_index])))
        # SVM
        classifier = SVC(kernel='rbf')
        classifier.fit(x_train[train_index], y_train[train_index])
            
        sentiment_svm.append(evaluate(df_sentiment[indexes_sentiment], classifier, vectorizer_bow.transform))
        amazon_svm.append(evaluate(df_amazon[indexes_amazon], classifier, vectorizer_bow.transform))
        airlines_svm.append(get_scores(y_train[test_index], classifier.predict(x_train[test_index])))
        # AdaBoost
        classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.)
        classifier.fit(x_train[train_index], y_train[train_index])
            
        sentiment_adaboost.append(evaluate(df_sentiment[indexes_sentiment], classifier, vectorizer_bow.transform))
        amazon_adaboost.append(evaluate(df_amazon[indexes_amazon], classifier, vectorizer_bow.transform))
        airlines_adaboost.append(get_scores(y_train[test_index], classifier.predict(x_train[test_index])))
        
        # ANN
        model = FCN(x_train[train_index], y_train_oh[train_index])
        sentiment_ann.append(evaluate(df_sentiment, model, vectorizer_bow.transform, one_hot))
        amazon_ann.append(evaluate(df_amazon, model, vectorizer_bow.transform, one_hot))
        airlines_ann.append(model.evaluate(x_train[test_index], y_train_oh[test_index])[1:])
        
amazon = pd.DataFrame({'NB':amazon_nb, 'SVM':amazon_svm, 'AdaBoost':amazon_adaboost, 'ANN':amazon_ann, 'RNN':amazon_rnn})
amazon.to_csv('amazon_scores_bow.csv')

sentiment = pd.DataFrame({'NB':sentiment_nb, 'SVM':sentiment_svm, 'AdaBoost':sentiment_adaboost, 'ANN':sentiment_ann, 'RNN':sentiment_rnn})
sentiment.to_csv('sentiment_scores_bow.csv')         

airlines = pd.DataFrame({'NB':airlines_nb, 'SVM':airlines_svm, 'AdaBoost':airlines_adaboost, 'ANN':airlines_ann, 'RNN':airlines_rnn})
airlines.to_csv('airlines_scores_bow.csv')

In [4]:
df_amazon_scores=pd.read_csv('amazon_scores.csv')
df_sentiment_scores=pd.read_csv('sentiment_scores.csv')
df_airlines_scores=pd.read_csv('airlines_scores.csv')

df_amazon_scores.head()

Unnamed: 0.1,Unnamed: 0,NB,AdaBoost,ANN,RNN
0,0,[0.37908058 0.72377017 0.37908058],[0.38163466 0.69889358 0.38163466],[0.3414714 0.33778468 0.27975348 0.47843286],[0.39540252 0.3956134 0.34360558 0.5118499 ]
1,1,[0.39865833 0.71795989 0.39865833],[0.32480262 0.71555708 0.32480262],[0.3578629 0.36236638 0.27961123 0.48089302],[0.42398864 0.4248059 0.3662462 0.5405918 ]
2,2,[0.35288581 0.72538366 0.35288581],[0.31620354 0.70879469 0.31620354],[0.32674506 0.31738183 0.26180407 0.4595451 ],[0.38192508 0.38320768 0.30238578 0.49419576]
3,3,[0.32262483 0.72892243 0.32262483],[0.28804131 0.72233032 0.28804131],[0.25516164 0.24041915 0.20474453 0.40125924],[0.28835207 0.26595458 0.20429069 0.4157634 ]


In [23]:
def clear(list_of_strings):
    total = []
    for string in list_of_strings:
        splitted = string.split(' ')
        splitted[0] = splitted[0][1:]
        splitted[-1] = splitted[-1][:-1]
        
        for k in range(len(splitted)):
            splitted[k] = float(splitted[k])
        
        total.append(splitted)
    return total

classifiers_avg = np.zeros((4,4))
np.asarray(df_amazon_scores.NB.values)

NB = np.mean(clear(np.asarray(df_amazon_scores.NB.values)), axis=0)
ADA = np.mean(clear(np.asarray(df_amazon_scores.AdaBoost.values)), axis=0)
ANN = np.mean(clear(np.asarray(df_amazon_scores.ANN.values)), axis=0)
RNN = np.mean(clear(np.asarray(df_amazon_scores.RNN.values)), axis=0)

plt.figure()
plt.bar([0,1,2,3], NB)
plt.show()

ValueError: could not convert string to float: 

In [21]:
np.asarray(df_amazon_scores.ANN.values)

array(['[0.3414714  0.33778468 0.27975348 0.47843286]',
       '[0.3578629  0.36236638 0.27961123 0.48089302]',
       '[0.32674506 0.31738183 0.26180407 0.4595451 ]',
       '[0.25516164 0.24041915 0.20474453 0.40125924]'], dtype=object)