In [1]:
from numpy import array
from sklearn.datasets import make_blobs
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
import random
import re
import string
from nltk import classify
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
from nltk import NaiveBayesClassifier
from nltk.classify import ClassifierI
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from nltk.stem.snowball import SnowballStemmer
import pickle
from statistics import mode
import matplotlib.pyplot as plt
import numpy as np

### Importing datasets

In [2]:
# Positive tweets from dataset
positive = pd.read_csv('..\\Datasets\\Training Datasets\\positive.csv', usecols=[
    'tweet_text', 'sentiment'], engine='python')

# Negative tweets from dataset
negative = pd.read_csv('..\\Datasets\\Training Datasets\\negative.csv', usecols=[
    'tweet_text', 'sentiment'], engine='python')

print("positive dataset dimension: ", len(positive))
print("negative dataset dimension: ", len(negative))

positive dataset dimension:  5500
negative dataset dimension:  5500


# Preprocessing
* Tokenization
* Stemmatization
* Removal of italian stopwords
* Removal of punctuation

## Tokenization

In [3]:
# Tokenizing positive, negative and text
positive_tokens = positive['tweet_text'].apply(word_tokenize)
negative_tokens = negative['tweet_text'].apply(word_tokenize)

## Stopwords and punctuation removal and Stemmatization

In [4]:
# Italian stopwords
stop_words = stopwords.words('italian')

# Italian Stemmer
stemmer = SnowballStemmer('italian')


# Additional stopwords found online
def additional_stop_words():
    with open('..\\Training\\stopwords.txt', 'r') as f:
        additional_stopwords = f.readlines()
    additional_stopwords = [x.strip() for x in additional_stopwords]
    return additional_stopwords


# Function to remove noise from tokens, removing also stopwords
def remove_noise(tweet_tokens, stop_words=(), additional_stop_words=()):
    cleaned_tokens = []
    for token in tweet_tokens:
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        token = stemmer.stem(token)
        if len(token) > 3 and token not in string.punctuation and token.lower() not in stop_words and token.lower() not in additional_stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [5]:
# Lists of positive and negative cleaned tokens
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

# Cleaning positive tokens and adding to list
for tokens in positive_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

# Cleaning negative tokens and adding to list
for tokens in negative_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [6]:
# Defining a generator function that takes a list of tweets as an argument and
# provides a list of words in all of the tweet tokens joined.
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

In [7]:
# Positive words
every_positive_word = get_all_words(positive_cleaned_tokens_list)

# Negative words
every_negative_word = get_all_words(negative_cleaned_tokens_list)

In [8]:
# What are the most positive words and how frequent are they?
freq_dist_positive = FreqDist(every_positive_word)
print(freq_dist_positive.most_common(10))

freq_dist_negative = FreqDist(every_negative_word)
print(freq_dist_negative.most_common(10))

[('grand', 1088), ('graz', 745), ('sempr', 727), ('buon', 492), ('augur', 385), ('bell', 353), ('campion', 320), ('forz', 302), ('tutt', 302), ('brav', 277)]
[('cazz', 587), ('merd', 507), ('vergogn', 479), ('part', 398), ('gioc', 386), ('fatt', 384), ('stat', 361), ('sempr', 354), ('tifos', 334), ('inter', 326)]


# Preparation for training

In [9]:
# Converts a list of cleaned tokens to dictionaries
# token as the key and True as values
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [10]:
# Preparing data for training
positive_tokens_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Attach label Positive or Negative to each tweet
positive_dataset = [(tweet_dict, 'Positive')
                    for tweet_dict in positive_tokens_model]
negative_dataset = [(tweet_dict, 'Negative')
                    for tweet_dict in negative_tokens_model]


In [12]:
# Create dataset by joining positive and negative
dataset = positive_dataset + negative_dataset

# Shuffle the dataset to avoid bias
random.shuffle(dataset)

# Separating training to test data 70/30
train_data = dataset[:9000]
test_data = dataset[9000:]

# Training with different algorithms
* Naive Bayes
* Logistic Regression
* Bernoulli Naive Bayes
* Multinomial Naibe Bayes
* Stochastic Gradient Descent
* Support Vector Classification
* NuSVC
* LinearSVC

### Naive Bayes

In [13]:
# Naive Bayes classifier
NaiveBayes = NaiveBayesClassifier.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(NaiveBayes, test_data))*100, '%')

Classifier accuracy percent: 84.0 %


In [14]:
# Save model as file for later usage
filename = '..\\Models\\NaiveBayes.pkl'
with open(filename, 'wb') as file:
    pickle.dump(NaiveBayes, file)

### Logistic Regression

In [15]:
# LogisticRegression classifier
LRClassifier = SklearnClassifier(LogisticRegression())
LRClassifier.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(LRClassifier, test_data))*100, '%')

Classifier accuracy percent: 88.0 %


In [16]:
# Save model as file for later usage
filename = '..\\Models\\LRClassifier.pkl'
with open(filename, 'wb') as file:
    pickle.dump(LRClassifier, file)

### Bernoulli Naive Bayes

In [17]:
# BernoulliNB classifier
BernoulliNB = SklearnClassifier(BernoulliNB())
BernoulliNB.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(BernoulliNB, test_data))*100, '%')

Classifier accuracy percent: 85.0 %


In [18]:
# Save model as file for later usage
filename = '..\\Models\\BernoulliNB.pkl'
with open(filename, 'wb') as file:
    pickle.dump(BernoulliNB, file)

### Multinomial Naive Bayes

In [19]:
# MultinomialNB classifier
MultinomialNB = SklearnClassifier(MultinomialNB())
MultinomialNB.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(MultinomialNB, test_data))*100, '%')

Classifier accuracy percent: 86.5 %


In [20]:
# Save model as file for later usage
filename = '..\\Models\\MultinomialNB.pkl'
with open(filename, 'wb') as file:
    pickle.dump(MultinomialNB, file)

### Stochastic Gradient Descent

In [21]:
# SGDClassifier classifier
SGDClassifier = SklearnClassifier(SGDClassifier())
SGDClassifier.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(SGDClassifier, test_data))*100, '%')

Classifier accuracy percent: 87.25 %


In [22]:
# Save model as file for later usage
filename = '..\\Models\\SGDClassifier.pkl'
with open(filename, 'wb') as file:
    pickle.dump(SGDClassifier, file)

### Support Vector Classification

In [23]:
# SVC classifier
SVC = SklearnClassifier(SVC())
SVC.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(SVC, test_data))*100, '%')

Classifier accuracy percent: 88.1 %


In [24]:
# Save model as file for later usage
filename = '..\\Models\\SVC.pkl'
with open(filename, 'wb') as file:
    pickle.dump(SVC, file)

### NuSVC

In [25]:
# NuSVC
NuSVC = SklearnClassifier(NuSVC())
NuSVC.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(NuSVC, test_data))*100, '%')

Classifier accuracy percent: 87.94999999999999 %


In [26]:
# Save model as file for later usage
filename = '..\\Models\\NuSVC.pkl'
with open(filename, 'wb') as file:
    pickle.dump(NuSVC, file)

### LinearSVC

In [27]:
# LinearSVC
LinearSVC = SklearnClassifier(LinearSVC())
LinearSVC.train(train_data)

print("Classifier accuracy percent:",
      (classify.accuracy(LinearSVC, test_data))*100, '%')

Classifier accuracy percent: 86.0 %


In [28]:
# Save model as file for later usage
filename = '..\\Models\\LinearSVC.pkl'
with open(filename, 'wb') as file:
    pickle.dump(LinearSVC, file)

## Ensemble Model
This model combines the predictions from each model and uses the majority vote as the final prediction

In [29]:
# Defininig the ensemble model class 

class EnsembleClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    # returns the classification based on majority of votes
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

In [30]:
# Load all classifiers from the pickled files

# function to load models given filepath
def load_model(file_path): 
    classifier_f = open(file_path, "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()
    return classifier

In [31]:
# Using an odd number of models to avoid the chance of a tie

# Original Naive Bayes Classifier
NB_clf = load_model('..\\Models\\NaiveBayes.pkl')

# BernoulliNB
BernoulliNB_clf = load_model('..\\Models\\BernoulliNB.pkl')

# Multinomial Naive Bayes Classifier 
MNB_Clf = load_model('..\\Models\\MultinomialNB.pkl')

# SVC Classifier 
SVC_Clf = load_model('..\\Models\\SVC.pkl')

# Logistic Regression Classifier 
LogReg_Clf = load_model('..\\Models\\LRClassifier.pkl')

# Stochastic Gradient Descent Classifier
SGD_Clf = load_model('..\\Models\\MultinomialNB.pkl')

# NuSVC
NuSVC_clf = load_model('..\\Models\\NuSVC.pkl')

# LienarSVC
LinearSVC_clf = load_model('..\\Models\\LinearSVC.pkl')

In [32]:
# Initializing the ensemble classifier 
Ensemble = EnsembleClassifier(NB_clf, MNB_Clf, SVC_Clf, LogReg_Clf, SGD_Clf, NuSVC_clf, LinearSVC_clf)

In [33]:
# Testing new classifier

print("Classifier accuracy percent:",
      (classify.accuracy(Ensemble, test_data))*100, '%')

Classifier accuracy percent: 88.6 %


In [34]:
# Save model as file for later usage
filename = '..\\Models\\Ensemble.pkl'
with open(filename, 'wb') as file:
    pickle.dump(Ensemble, file)

### The prediciton has improved by 0.5% using the Ensemble model