In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection  import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
%matplotlib inline

# Objective

This data set from Kaggle contains tweets about COVID19 vaccines, together with their lists of hashtags. The goal of this project is to classify these tweets with NLP algorithms in order to distinguish health-oriented, political and superstition-based content.

In order to achieve this, we follow three steps:

    1.	Text processing and data preparation: in this step we format the data, delete GDPR-related information, extract links from tweets and check some data features.
    
    2.	Assigning categories: this is a semi-manual process which plays a role in the prediction quality. We have to classify the tweets directly one by one, which is a time-consuming manual process but will yield higher precision in the end.
        
    
    Approval                  | 207 | % 6.95 
    Business                  | 47  | % 1.58 
    Health                    | 108 | % 3.63 
    Junk Knowledge            | 194 | % 6.51 
    Other                     | 168 | % 5.64 
    Politics                  | 123 | % 4.13 
    Procurement and Logistics | 202 | % 6.78 
    Unique vaccination        | 619 | % 20.79 
    Vaccination campaign      | 312 | % 10.48 
    Vaccine                   | 287 | % 9.64 
    Side effects              | 117 | % 3.93 

    Unclassified              | 594 | % 19.95 

    DISCLAIMER: these classifications are the result of a manual effort without proper medical knowledge. Medical professionals may have a different opinion.

    3.	Once categories are assigned, an NLP pipeline (vectorizer and classifier) is used to predict the category of each tweet.

In the end we will have a classifier that predicts whether a tweet has real informative value with regard to COVID19 and helps to filter out junk science and other irrelevant information.


## 1. Data Pre-processing

In [2]:
# Load the raw tweets from the 'vaccination_tweets' sheet of the Excel workbook.
twitter_data = pd.read_excel(
    'vaccination_tweets.xlsx',
    sheet_name='vaccination_tweets',
)

As a first step, all countries were normalised as a potential feature; for the moment the geographical location is not used:

In [3]:
# Replace missing countries with the placeholder 'NA'. Assigning the result back
# (instead of calling fillna(inplace=True) on the selected column) guarantees the
# frame itself is updated, including when pandas Copy-on-Write is active.
twitter_data['Country'] = twitter_data['Country'].fillna('NA')

# Number of distinct countries, the 'NA' placeholder counting as one.
nb_countries = twitter_data['Country'].nunique()

# Two-column lookup table: one row per country with its number of tweets.
freq = (
    twitter_data['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Country_freq')
)

# Attach the frequency of each row's country. Mapping writes (or overwrites) a
# single column, so re-running this cell does not produce the
# Country_freq_x / Country_freq_y duplicates a repeated merge would create.
twitter_data['Country_freq'] = twitter_data['Country'].map(
    freq.set_index('Country')['Country_freq']
)

The text field contains both the tweet and the link to this tweet. We separate the two in order to (1) remove the noise introduced by the link and (2) make it easier to check the tweets later:

In [4]:
def link_extractor(text):
    """Return the link part of a raw tweet string.

    The raw value is expected to look like ``<tweet> ¦ <link>``. The stripped
    segment that follows the first '¦' separator (up to the next separator, if
    any) is returned; an empty string is returned when there is no separator.
    """
    if '¦' not in text:
        return ""
    # Only the segment right after the first separator is of interest.
    _, link, *_ = text.split('¦')
    return link.strip()

def text_extractor(text):
    """Return the tweet part of a raw tweet string.

    When the value contains a '¦' separator, everything before the first
    separator is kept; otherwise the whole value is kept. Surrounding
    whitespace is stripped in both cases.
    """
    tweet = text.split('¦')[0] if '¦' in text else text
    return tweet.strip()

# Store the link of every tweet in its own column.
twitter_data['link'] = twitter_data['text'].apply(link_extractor)
# NOTE(review): the matching text extraction below is disabled, so 'text' still
# holds the raw value (tweet plus link) - confirm this is intended.
#twitter_data['text'] = twitter_data['text'].apply(lambda t:text_extractor(t))

The table contains data related to the users. As we do not need these columns, it is better to get rid of these data for privacy reasons:

In [5]:
# GDPR filters: remove the id and user-related columns from the table, in place.
twitter_data.drop(columns=['id', 'user_name', 'user_description'], inplace=True)

## 2. Assigning Categories

The second step is reading all manually classified values and inserting them into a new column.
To date, 60% of the tweets are classified.

In [6]:
# Manually assigned categories, read from the processed workbook. Rows that
# have not been classified yet carry an empty string instead of NaN.
cat = pd.read_excel(
    'vaccination_tweets_processed.xlsx',
    sheet_name='data processed',
)[['Category']].fillna('')

# The categories are matched to the tweets purely by row position, so both
# tables must have the same number of rows; fail loudly instead of silently
# padding the shorter table with NaN.
if len(cat) != len(twitter_data):
    raise ValueError(
        "Row count mismatch: {} categories for {} tweets".format(len(cat), len(twitter_data))
    )

# Assigning the column (instead of concatenating frames) keeps this cell
# idempotent: a re-run overwrites 'Category' rather than adding a duplicate
# column of the same name.
twitter_data['Category'] = cat['Category'].to_numpy()

PermissionError: [Errno 13] Permission denied: 'vaccination_tweets_processed.xlsx'

In [None]:
# List the distinct category labels present in the table.
pd.unique(twitter_data['Category'])

## 3. NLP Classification

The first step is filtering out unclassified or undesired categories:

In [None]:
# Keep only the tweets used for training: drop unclassified rows (empty
# category) as well as the 'Other', 'Business' and 'Politics' categories.
excluded_categories = ['', 'Other', 'Business', 'Politics']
keep_mask = ~twitter_data['Category'].isin(excluded_categories)

# Boolean indexing returns a new frame; renumber its rows from 0.
twitter_data_sample = twitter_data[keep_mask].reset_index(drop=True)
twitter_data_sample

In [None]:
def text_cleaner(text):
    """Remove links, tagged user names and hashtags from a tweet.

    These tokens carry no real value for the classification and only add noise
    to the text.

    The text is split on any whitespace (spaces, tabs and newlines), so a word
    that is separated from a hashtag or link only by a line break is no longer
    discarded together with it. Every token containing 'http', '@' or '#' is
    dropped and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet without links, mentions and hashtags.
    """
    # 'http' also matches 'https', so both kinds of links are removed.
    noise_markers = ('http', '@', '#')
    kept_tokens = [
        token
        for token in text.split()
        if not any(marker in token for marker in noise_markers)
    ]
    return ' '.join(kept_tokens)

# Strip links, mentions and hashtags from every tweet kept for training.
twitter_data_sample['text'] = twitter_data_sample['text'].apply(text_cleaner)
twitter_data_sample

Defining text filtering function

In [None]:
def text_process(mess):
    """Tokenise a message for the bag-of-words model.

    1. remove punctuation
    2. remove stop words
    3. return list of clean text words

    Parameters
    ----------
    mess : str
        Text of one message.

    Returns
    -------
    list of str
        The words of the message, in their original casing, without
        punctuation characters and without English stop words.
    """
    # Look the stop words up once per message and keep them in a set; the
    # original re-evaluated stopwords.words('english') for every single word
    # and tested membership against the returned list.
    stop_words = set(stopwords.words('english'))

    # Delete every character listed in string.punctuation.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # Stop words are matched case-insensitively.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

Importing NLP libraries:

In [None]:
import nltk
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords
# Learn the bag-of-words vocabulary from the cleaned tweets, using the custom
# analyzer for tokenisation.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(twitter_data_sample['text'])

Creating the TF-IDF matrix

In [None]:
# Word-count matrix of the cleaned tweets.
messages_bow = bow_transformer.transform(twitter_data_sample['text'])

# Fit the TF-IDF weighting on the counts and apply it in a single step.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

Finally, we predict the tweets Category using a MultinomialNB classifier

In [None]:
# Classifier fitted on the TF-IDF matrix of all labelled tweets (no hold-out).
spam_detect_model = MultinomialNB().fit(messages_tfidf, twitter_data_sample['Category'])

# Hold out 30% of the labelled tweets for evaluation. The fixed random_state
# makes the split, and therefore the report printed below, reproducible across
# kernel restarts.
msg_train, msg_test, label_train, label_test = train_test_split(
    twitter_data_sample['text'],
    twitter_data_sample['Category'],
    test_size=0.3,
    random_state=42,
)

# Bag-of-words -> TF-IDF -> Naive Bayes, chained so that raw text can be passed
# directly to fit() and predict().
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
pipeline.fit(msg_train, label_train)

predictions = pipeline.predict(msg_test)
# The original, misspelled name is kept as an alias in case other cells use it.
precitions = predictions
print(classification_report(label_test, predictions))


The 'Unique vaccination' category scores lower than the others because a considerable number of these tweets are pictures taken after the vaccination rather than clear text.

### Some predictions

In [None]:
def predict_value(i):
    """Print the cleaned text, manual category and predicted category of one tweet.

    `i` is the index label of the row to look up in `twitter_data`. A row whose
    manual category is an empty string is reported as 'Unclassified'.
    """
    row = twitter_data.loc[i]
    text = text_cleaner(row['text'])
    cat = row['Category'] if row['Category'] != "" else 'Unclassified'
    # The pipeline expects a collection of documents; take the single result.
    predict = pipeline.predict([text])[0]
    print("Text:\n----------------------\n{}\n----------------------\nCategory: {}\nPredicted: {}".format(text,cat,predict))

#### Predicting element 50

In [None]:
# Print the cleaned text, manual category and predicted category of row 50.
predict_value(50)

#### Predicting element 400

In [None]:
# Print the cleaned text, manual category and predicted category of row 400.
predict_value(400)

#### Predicting element 1000

In [None]:
# Print the cleaned text, manual category and predicted category of row 1000.
predict_value(1000)

#### Predicting element 1500

In [None]:
# Print the cleaned text, manual category and predicted category of row 1500.
predict_value(1500)

#### Predicting element 1800

In [None]:
# Print the cleaned text, manual category and predicted category of row 1800.
predict_value(1800)