Initialize notebook

In [1]:
# Mount Google Drive inside the Colab runtime (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive/')

# Root folder of the project on the mounted drive. Keep the trailing slash:
# the rest of the notebook builds paths by plain string concatenation
# (e.g. drive_folder + 'Dataset/...').
drive_folder = '/content/drive/My Drive/PAi 15/'

# Import packages

import pandas as pd
import numpy as np
import re
from random import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

import sys
sys.path.insert(0, drive_folder + 'Data part')
from read_file import dataframe_from_mult_files
from sentiment_file_manager import save_sentiments

Mounted at /content/drive/
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Useful functions

In [2]:
def shuffle_list(*ls):
    """
    Shuffle several sequences in unison.

    The sequences are zipped into rows, the rows are shuffled in place with
    `random.shuffle` (so the result depends on the global `random` state),
    and the rows are unzipped again. Elements that shared an index before
    the shuffle still share an index afterwards.

    Args:
        *ls: the sequences to shuffle together.

    Returns:
        A `zip` iterator yielding one tuple per input sequence; it is meant
        to be unpacked, e.g. `a, b = shuffle_list(a, b)`.
    """
    rows = [row for row in zip(*ls)]
    shuffle(rows)
    return zip(*rows)

def text_preprocessing(s, stop_words=None):
    """
    Clean a raw text string before it is vectorised.

    Steps, in order:
    - Lowercase the sentence
    - Change "'t" to " not"
    - Remove "@name" mentions, including one at the very end of the string
    - Isolate punctuation marks, then replace every character that is not a
      word character, whitespace or "?" with a space
    - Replace a few remaining special characters / newlines with a space
    - Remove stop words except "not" and "can"
    - Collapse runs of whitespace and strip both ends

    Args:
        s (str): the text to clean.
        stop_words (iterable of str, optional): words to drop. Defaults to
            the NLTK English stop word list. "not" and "can" are always
            kept, even when they appear in this collection.

    Returns:
        str: the cleaned text, tokens separated by single spaces.
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name. The mention ends at the next whitespace OR at the end of
    # the string, so a trailing "@name" is removed as well.
    s = re.sub(r'@\S*(?:\s|$)', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'. The lookup set is built once
    # per call instead of re-reading the stop word list for every word.
    if stop_words is None:
        stop_words = stopwords.words('english')
    removable = set(stop_words) - {'not', 'can'}
    s = " ".join(word for word in s.split() if word not in removable)
    # Collapse repeated whitespace and strip leading/trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

Load training set

In [None]:
# trainset = pd.read_csv(drive_folder + '/Sentiment/train.tsv.zip', sep="\t")

# rev0=trainset['Phrase']
# sentiment0=trainset["Sentiment"]
# #y=to_categorical(movie_reviews.Sentiment.values)
# print("Nombre total de Reviews :"+ str(len(sentiment0)))

# rev,sentiment=shuffle_list(rev0,sentiment0)

# # Rebalance the dataset: 35000 'neutral' samples must be removed
# n=79582-35000
# k=0
# y0=[]
# reviews0=[]
# for i in range(len(sentiment)):
#   el=sentiment[i]
#   if el==0 : 
#     y0.append(0)
#     reviews0.append(rev[i])
#   if el==1 :
#     y0.append(1)
#     reviews0.append(rev[i])
#   if el==2 and k<n:
#     k+=1 
#     y0.append(2)
#     reviews0.append(rev[i])
#   if el==3 :
#     y0.append(3)
#     reviews0.append(rev[i])
#   if el==4 :
#     y0.append(4)
#     reviews0.append(rev[i])

# y_train=np.array(y0,dtype='float32')
# X_train=np.array(reviews0)

# # Reduce the 5 classes to 3 classes.
# temp_train=[]
# for el in y_train :
#   if el<= 1 :
#     temp_train.append(0)
#   elif el == 2 :
#     temp_train.append(1)
#   else :
#     temp_train.append(2)
# y_train=np.array(temp_train,dtype='int')

Nombre total de Reviews :156060


In [3]:
# Load the labelled airline tweets that serve as the training corpus.
tweet_reviews = pd.read_csv(drive_folder + 'Dataset/Données externes/Tweets-Flight.csv')
rev = np.array(tweet_reviews['text'])
sentiment = tweet_reviews['airline_sentiment']

# Rebalance the dataset: at most n 'negative' tweets are kept (this is the
# only class that is capped). 9178 is presumably the number of 'negative'
# rows in the file, of which 5500 are dropped -- TODO confirm against the CSV.
n = 9178 - 5500
k = 0  # number of 'negative' tweets kept so far

# Integer class id for each sentiment label; rows with any other label are skipped.
class_ids = {'negative': 0, 'neutral': 1, 'positive': 2}

y0 = []
reviews0 = []
for text, el in zip(rev, sentiment):
    if el not in class_ids:
        continue
    if el == 'negative':
        if k >= n:
            continue  # quota reached: drop the surplus 'negative' tweet
        k += 1
    y0.append(class_ids[el])
    reviews0.append(text)

y_train = np.array(y0, dtype='float32')
X_train = np.array(reviews0)

Preprocess training set

In [4]:
# Clean every training text with text_preprocessing
X_train_preprocessed = np.array(list(map(text_preprocessing, X_train)))

# Fit the TF-IDF vectoriser (uni-, bi- and tri-grams, binary term counts,
# unsmoothed idf) on the cleaned training texts and vectorise them
tf_idf = TfidfVectorizer(
    smooth_idf=False,
    binary=True,
    ngram_range=(1, 3),
)
X_train_tfidf = tf_idf.fit_transform(X_train_preprocessed)

Fit a Multinomial Naive Bayes model

In [5]:
# Additive smoothing parameter passed to the classifier
# (how this value was selected is not shown in this notebook).
best_alpha = 0.3

# Train the classifier on the TF-IDF features of the training set.
# The fit call is left as the last expression so the cell displays the estimator.
multi_nb_model = MultinomialNB(alpha=best_alpha)
multi_nb_model.fit(X=X_train_tfidf, y=y_train)

MultinomialNB(alpha=0.3, class_prior=None, fit_prior=True)

Load tweets

In [6]:
# Base name (without extension) of the tweet file to score; it is reused
# when the predictions are saved.
tweets_filename = 'stream_tweets_Week4_181k'
# tweets_filename = 'All_tweets_231k'

# The loader takes a list of CSV paths; a single file is used here.
tweets_csv_paths = [drive_folder + 'Dataset/' + tweets_filename + '.csv']
tweets = dataframe_from_mult_files(tweets_csv_paths)

Preprocess tweets

In [7]:
# Clean the tweets with the same routine used for the training texts
tweets_preprocessed = np.array(list(map(text_preprocessing, tweets['text'])))
# Vectorise with the already-fitted TF-IDF vocabulary (transform only, no refit)
tweets_tfidf = tf_idf.transform(tweets_preprocessed)

Get predictions from Naive Bayes

In [8]:
# Pick the most probable class for each tweet, then shift the column index
# by one so the values become -1 / 0 / 1 instead of 0 / 1 / 2
# (assuming the model's classes are ordered 0, 1, 2 as encoded for training).
class_probabilities = multi_nb_model.predict_proba(tweets_tfidf)
sentiments = np.argmax(class_probabilities, axis=1) - 1

Save predictions

In [9]:
# Hand the predictions to the project helper imported from sentiment_file_manager.
# NOTE(review): the output file layout is decided by save_sentiments -- confirm there.
save_sentiments(
    model_name='Naive Bayes',
    tweets_filename=tweets_filename,
    folder_name=drive_folder + 'Dataset/Sentiments/',
    sentiments=sentiments,
)