In [1]:
from numpy import array
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random
import re
import string
from nltk import classify
from nltk.stem.snowball import SnowballStemmer
import pickle
import matplotlib.pyplot as plt
import numpy as np

# Model loading
I will be using the Logistic Regression classifier, since it is the most accurate (87.9%).

In [2]:
# Loading the trained Logistic Regression classifier with pickle.
# The path gets its own name so that `LRClassifier` only ever refers to the
# model object: if loading fails, no path string is left behind under the
# model's name to cause confusing errors in later cells.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
LR_CLASSIFIER_PATH = "..\\Models\\LRClassifier.pkl"
with open(LR_CLASSIFIER_PATH, 'rb') as file:
    LRClassifier = pickle.load(file)

In [3]:
# Read the cleaned March tweets, keeping only the three columns used below
march_tweets = pd.read_csv(
    '..\\Datasets\\JSON\\March\\dataCleaned.csv',
    engine='python',
    usecols=['data', 'text', 'date'],
)

# Preprocessing
* Tokenization
* Stemming
* Removal of Italian stopwords
* Removal of punctuation

In [4]:
# Italian stopwords from the NLTK stopwords corpus
# NOTE(review): this list is not passed to remove_noise in the classification
# loop below, so it currently has no effect there — confirm whether that is intended.
stop_words = stopwords.words('italian')

# Italian Snowball stemmer; remove_noise reads this module-level object
stemmer = SnowballStemmer('italian')


# Additional stopwords found online
def additional_stop_words():
    """Return the entries of 'Training\\stopwords.txt', one per line.

    Each line is stripped of surrounding whitespace (including the trailing
    newline). Blank lines are kept as empty strings.
    """
    with open('Training\\stopwords.txt', 'r') as handle:
        return [line.strip() for line in handle]


# Function to remove noise from tokens, removing also stopwords
def remove_noise(tweet_tokens, stop_words=(), additional_stop_words=()):
    cleaned_tokens = []
    for token in tweet_tokens:
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        token = stemmer.stem(token)
        if len(token) > 1 and token not in string.punctuation and token.lower() not in stop_words and token.lower() not in additional_stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [5]:
# Pull the 'data', 'text' and 'date' columns of the March frame out as plain lists
march, tokenized, date = (
    march_tweets[column].values.tolist() for column in ('data', 'text', 'date')
)
# Accumulator for the classified March tweets
classified = list()

# Analysis

In [6]:
# For each tweet: tokenize, remove noise, and record the classifier's
# probabilities for the 'Positive' and 'Negative' labels.
# The list is rebuilt from scratch here so that re-running this cell does not
# append duplicate rows (which would break the column assignments that follow).
# NOTE(review): remove_noise is called without stop_words, so no stop words are
# filtered at this point — confirm this matches how the model was trained.
classified = []
for tweet in march:
    custom_tokens = remove_noise(word_tokenize(tweet))
    features = {token: True for token in custom_tokens}
    # Classify once and read both label probabilities from the same result
    distribution = LRClassifier.prob_classify(features)
    classified.append(
        (tweet, distribution.prob('Positive'), distribution.prob('Negative')))

In [7]:
# Build the result frame in one chain:
#   - start from the (tweet, positive, negative) tuples,
#   - polarity is the positive probability minus the negative one,
#   - attach the tokenized text and the date lists as columns,
#   - finally put the columns in their presentation order.
df = (
    pd.DataFrame(classified, columns=['tweet', 'positive', 'negative'])
    .assign(
        polarity=lambda frame: frame['positive'] - frame['negative'],
        tokenized=tokenized,
        date=date,
    )
    .loc[:, ['date', 'tweet', 'tokenized', 'positive', 'negative', 'polarity']]
)

In [8]:
# Quick glance at the dataframe: print its first 10 rows as plain text
print(df.head(10))

                        date  \
0  2020-03-01 23:59:55+00:00   
1  2020-03-01 23:59:54+00:00   
2  2020-03-01 23:59:54+00:00   
3  2020-03-01 23:59:35+00:00   
4  2020-03-01 23:59:31+00:00   
5  2020-03-01 23:59:15+00:00   
6  2020-03-01 23:58:40+00:00   
7  2020-03-01 23:58:26+00:00   
8  2020-03-01 23:58:05+00:00   
9  2020-03-01 23:57:47+00:00   

                                               tweet  \
0  Cazzarola ora tutto pi chiaro Altro che cinesi...   
1  566 nuovi casi dallinizio di domenicalive none...   
2  coronavirus scuole conte campania ULTIMISSIMA ...   
3  Il commento di HoaraBorselli contro AdrianoPan...   
4  MotoGP 2020 Coronavirus Cancellati i primi due...   
5  amore ai tempi del corona virus mia figlia mi ...   
6  Ho messo un aggregato dei dati disponibili su ...   
7  Ecco le istruzioni per creare il vaccino del c...   
8                           Coronavirus buone regole   
9  VIDEO Il Coronavirus monopolizza l informazion...   

                              

In [9]:
# Saving dataframe to csv
# The frame holds the tweets read from the March dataset, so it is written to
# a March file; the previous 'april_analyzed.csv' target would have
# overwritten the April results with March data.
df.to_csv('..\\Datasets\\CSV\\march_analyzed.csv')