In [10]:
from numpy import array
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
import re
import string
from nltk import classify
from nltk.stem.snowball import SnowballStemmer
import pickle
import matplotlib.pyplot as plt
import numpy as np

# Model loading
I will be using the Logistic Regression classifier, since it is the most accurate (87.9% accuracy).

In [11]:
# Loading model with pickle
# NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
# The path gets its own name so that `LRClassifier` only ever refers to the loaded model.
LR_CLASSIFIER_PATH = "..\\Models\\LRClassifier.pkl"
with open(LR_CLASSIFIER_PATH, 'rb') as file:
    LRClassifier = pickle.load(file)

In [12]:
# Load the cleaned April tweets, reading only the three columns used below
tweets = pd.read_csv(
    '..\\Datasets\\JSON\\April\\dataCleaned.csv',
    usecols=['data', 'text', 'date'],
    engine='python',
)

# Preprocessing
* Tokenization
* Stemming
* Removal of Italian stopwords
* Removal of punctuation

In [13]:
# Both NLTK resources below are requested for the same language
LANGUAGE = 'italian'

# Snowball stemmer and stopword list for that language
stemmer = SnowballStemmer(LANGUAGE)
stop_words = stopwords.words(LANGUAGE)


# Additional stopwords found online
def additional_stop_words():
    """Read the extra stopword list from 'Training\\stopwords.txt'.

    Returns:
        list[str]: one entry per line of the file, in file order, with
        leading/trailing whitespace stripped (a blank line yields '').
    """
    with open('Training\\stopwords.txt', 'r') as handle:
        return [line.strip() for line in handle]


# Function to remove noise from tokens, removing also stopwords
def remove_noise(tweet_tokens, stop_words=(), additional_stop_words=()):
    """Clean and stem a tokenized tweet.

    For every token, URLs and @mentions are stripped and the remainder is
    stemmed with the module-level `stemmer`. The stem is kept (lower-cased)
    only if it is longer than one character, is not a substring of
    `string.punctuation`, and its lower-cased form is in neither stopword
    collection.

    Args:
        tweet_tokens: iterable of string tokens.
        stop_words: iterable of lower-case stopwords to discard.
        additional_stop_words: second iterable of lower-case stopwords to discard.

    Returns:
        list[str]: the surviving lower-cased stems, in their original order.
    """
    # Merge both stopword collections into a single set, built once per call,
    # so each token needs one hash lookup instead of a scan of two sequences.
    ignored = set(stop_words).union(additional_stop_words)

    cleaned_tokens = []
    for token in tweet_tokens:
        # Raw strings: '\(' and '\)' are invalid escape sequences in a normal
        # string literal; the resulting pattern is unchanged.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        token = stemmer.stem(token)
        lowered = token.lower()
        if len(token) > 1 and token not in string.punctuation and lowered not in ignored:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [14]:
# Pull the 'data', 'text' and 'date' columns out of the April frame as plain Python lists
april, tokenized, date = (
    tweets[column].values.tolist() for column in ('data', 'text', 'date')
)
# Accumulator for the classified April tweets
classified = []

# Analysis


In [15]:
# For each tweet, tokenize and remove noise, ask the model for the probability of each
# label and add (tweet, P(Positive), P(Negative)) to the classified list
# Start from an empty list so that re-running this cell does not append duplicates
classified = []
for tweet in april:
    # Pass the Italian stopword list explicitly: without it remove_noise falls back
    # to its empty defaults and no stopword is removed.
    # NOTE(review): additional_stop_words() is not applied here — confirm whether the
    # model was trained with it before adding it.
    custom_tokens = remove_noise(word_tokenize(tweet), stop_words)
    # Classify once and read both label probabilities from the same distribution
    distribution = LRClassifier.prob_classify({token: True for token in custom_tokens})
    classified.append((tweet, distribution.prob('Positive'), distribution.prob('Negative')))

In [16]:
# Build the results table in one chain: the classifier output first, then the
# derived and carried-over columns, and finally the presentation column order
df = (
    pd.DataFrame(classified, columns=['tweet', 'positive', 'negative'])
    .assign(
        # Polarity is the positive probability minus the negative one
        polarity=lambda frame: frame['positive'] - frame['negative'],
        tokenized=tokenized,
        date=date,
    )
    [['date', 'tweet', 'tokenized', 'positive', 'negative', 'polarity']]
)

In [17]:
# Quick glance at the dataframe (bare last expression so Jupyter renders the rich table)
df.head(10)

                        date  \
0  2020-04-30 23:59:54+00:00   
1  2020-04-30 23:59:34+00:00   
2  2020-04-30 23:59:08+00:00   
3  2020-04-30 23:58:44+00:00   
4  2020-04-30 23:58:19+00:00   
5  2020-04-30 23:57:01+00:00   
6  2020-04-30 23:56:23+00:00   
7  2020-04-30 23:56:13+00:00   
8  2020-04-30 23:55:06+00:00   
9  2020-04-30 23:54:53+00:00   

                                               tweet  \
0  Usa si fermano allevamentiintensivi e impianti...   
1  Coronavirus Speranza firma decreto criteri mon...   
2  La Lega occupa il Parlamento a rischio Coronav...   
3  gb boris johnson ampquotSuperato il picco di e...   
4  Coronavirus il bilancio del 30 aprile record d...   
5  Coronavirus pi di 230mila morti nel mondo Posi...   
6  COVID19 Esiste una teoria piuttosto seria seco...   
7  Secondo tito boeri da cui ilsole24ore ha ripre...   
8  La Lega occupa il Parlamento a rischio Coronav...   
9  Coronavirus dagli assembramenti agli orari di ...   

                              

In [19]:
# Persist the analysed April tweets as CSV (no `index` argument, so pandas' default applies)
df.to_csv(path_or_buf='..\\Datasets\\CSV\\april_analyzed.csv')