In [2]:
from numpy import array
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
import re
import string
from nltk import classify
from nltk.stem.snowball import SnowballStemmer
import pickle
import matplotlib.pyplot as plt
import numpy as np

# Model loading
I will use the Logistic Regression classifier because it is the most accurate and most consistent model (88% accuracy).

In [3]:
# Load the trained Logistic Regression classifier from its pickle file.
# The path is kept in its own variable so that `LRClassifier` only ever refers
# to the model object, never to a string.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only load model files that were produced by a trusted source.
# NOTE(review): the path uses Windows-style separators and is relative to the
# notebook's working directory.
model_path = "..\\Models\\LRClassifier.pkl"
with open(model_path, 'rb') as file:
    LRClassifier = pickle.load(file)

In [4]:
# Read the cleaned January tweets, keeping only the three columns used below
tweets = pd.read_csv(
    '..\\Datasets\\JSON\\January\\dataCleaned.csv',
    usecols=['data', 'text', 'date'],
    engine='python',
)

# Preprocessing
* Tokenization
* Stemming
* Removal of Italian stopwords
* Removal of punctuation

In [5]:
# Snowball stemmer configured for Italian (used by remove_noise)
stemmer = SnowballStemmer('italian')

# NLTK's built-in list of Italian stopwords
stop_words = stopwords.words('italian')


# Additional stopwords found online
def additional_stop_words():
    """Return the lines of the extra stopwords file as a list of strings.

    Each line is stripped of leading and trailing whitespace; blank lines
    therefore come back as empty strings.
    """
    with open('Training\\stopwords.txt', 'r') as handle:
        return [line.strip() for line in handle]


# Function to remove noise from tokens, removing also stopwords
def remove_noise(tweet_tokens, stop_words=(), additional_stop_words=()):
    """Clean a sequence of tweet tokens and return the surviving stems.

    For every token the function:
      1. deletes any URL-like substring and any ``@mention`` substring,
      2. stems what is left with the module-level ``stemmer``,
      3. drops the result if it is one character or shorter, if it is a
         substring of ``string.punctuation``, or if it is a stopword.

    Args:
        tweet_tokens: iterable of string tokens (e.g. from ``word_tokenize``).
        stop_words: collection of lowercase stopwords to discard.
        additional_stop_words: second collection of lowercase stopwords.

    Returns:
        list of lowercase stemmed tokens, in their original order.
    """
    # Merge both stopword collections once so each lookup is a set membership test.
    blocked = set(stop_words).union(additional_stop_words)

    cleaned_tokens = []
    for token in tweet_tokens:
        # Raw strings avoid the invalid "\(" escape-sequence warning; the
        # patterns themselves are unchanged.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        surface = token.lower()
        stemmed = stemmer.stem(token)
        if len(stemmed) <= 1 or stemmed in string.punctuation:
            continue

        lowered = stemmed.lower()
        # Stopword lists hold full word forms, so the unstemmed token must be
        # checked as well: the stem alone (e.g. "quest" for "questo") would
        # otherwise slip through the filter.
        if surface in blocked or lowered in blocked:
            continue
        cleaned_tokens.append(lowered)
    return cleaned_tokens

In [6]:
# Pull the three loaded columns out of the dataframe as plain Python lists
date = tweets['date'].values.tolist()
tokenized = tweets['text'].values.tolist()
january = tweets['data'].values.tolist()

# Accumulator for the classified January tweets
classified = list()

# Analysis

In [7]:
# For each tweet: tokenize, remove noise, and record the classifier's
# probability for the 'Positive' and 'Negative' labels.
# NOTE(review): remove_noise is called without stop_words or
# additional_stop_words, so no stopword filtering happens here — confirm this
# matches the preprocessing used when the model was trained.
# The list is rebuilt from scratch so re-running this cell does not append
# duplicate rows.
classified = []
for tweet in january:
    custom_tokens = remove_noise(word_tokenize(tweet))
    features = {token: True for token in custom_tokens}
    # Classify once and read both label probabilities from the same distribution.
    distribution = LRClassifier.prob_classify(features)
    classified.append(
        (tweet, distribution.prob('Positive'), distribution.prob('Negative')))

In [8]:
# Build the result table in one chain:
#   - start from the (tweet, positive, negative) tuples,
#   - derive polarity as positive minus negative,
#   - attach the tokenized text and date lists as columns,
#   - put the columns in their final order.
df = (
    pd.DataFrame(classified, columns=['tweet', 'positive', 'negative'])
    .assign(
        polarity=lambda frame: frame['positive'] - frame['negative'],
        tokenized=tokenized,
        date=date,
    )
    .loc[:, ['date', 'tweet', 'tokenized', 'positive', 'negative', 'polarity']]
)

In [9]:
# Quick glance at the dataframe: print the first 10 rows as plain text
print(df.head(10))

                        date  \
0  2020-01-30 23:59:55+00:00   
1  2020-01-30 23:59:50+00:00   
2  2020-01-30 23:59:49+00:00   
3  2020-01-30 23:59:35+00:00   
4  2020-01-30 23:59:25+00:00   
5  2020-01-30 23:59:17+00:00   
6  2020-01-30 23:59:12+00:00   
7  2020-01-30 23:59:10+00:00   
8  2020-01-30 23:59:05+00:00   
9  2020-01-30 23:58:58+00:00   

                                               tweet  \
0  piazzapulita Formigli lo posso dire Stasera st...   
1  Due casi confermati in Italia una coppia di tu...   
2  Anche stavolta azzecchiamo il trend topic doma...   
3  Mannoni raccomanda di informarsi al meglio sul...   
4  Coronavirus a Roma due turisti cinesi infetti ...   
5  dovresti vedere la trasmissione ti scoppierebb...   
6  GiuseppeConteIT noi Italiani non siamo stupidi...   
7  Detto questo voglio ricordare che attualmente ...   
8        sulla gestione della situazione coronavirus   
9  Lineanotte niente allarmismi Salvo parlare da ...   

                              

In [18]:
# Print the most frequent polarity value(s). mode() returns a Series, so the
# printed output includes its index and dtype alongside the value.
print("Most recurrent polarity: ", df.polarity.mode())

Most recurrent polarity:  0    0.154542
dtype: float64


In [13]:
# Saving dataframe to csv
# No index= argument is passed, so pandas' default index handling applies.
df.to_csv('..\\Datasets\\CSV\\january_analyzed.csv')