In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import emoji

In [3]:
# Analysis with sample data: read the CSV and keep only the tweet text and
# the country code, as an independent copy of those two columns.
data = 'sample.csv'
subsample = pd.read_csv(data, lineterminator='\n')
tweet = subsample.loc[:, ['text', 'country_code']].copy()

In [5]:
# Two-letter country code -> country name.
countries = {
    'IN': 'India',
    'IR': 'Iran',
    'NG': 'Nigeria',
    'UG': 'Uganda',
    'GM': 'Gambia',
    'LY': 'Libya',
    'PK': 'Pakistan',
    'ZW': 'Zimbabwe',
    'CA': 'Canada',
    'GB': 'United Kingdom',
    'FI': 'Finland',
    'NO': 'Norway',
    'IE': 'Ireland',
    'FR': 'France',
    'AU': 'Australia',
    'SG': 'Singapore',
}
# Plain dict lookup per row: a code that is not a key of `countries` raises KeyError.
tweet['country'] = tweet['country_code'].apply(countries.__getitem__)

In [6]:
# Preprocess based on hugging face instructions
# source: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; after that, any token that
    contains 'http' becomes 'http'. Tokens are re-joined with single spaces,
    so runs of spaces in the input are preserved.
    """
    def _mask(token):
        # Mention check comes first, so '@http...' ends up as '@user'.
        if len(token) > 1 and token.startswith('@'):
            token = '@user'
        if "http" in token:
            return 'http'
        return token

    return " ".join(_mask(piece) for piece in text.split(" "))

In [7]:
# Masked version of every tweet (mentions -> '@user', links -> 'http').
tweet['new_text'] = tweet['text'].apply(preprocess)

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [2]:
# Checkpoint identifier; the tokenizer and the classifier are both loaded from it.
Roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(Roberta)
model = AutoModelForSequenceClassification.from_pretrained(Roberta)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# source: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
# source: https://medium.com/mlearning-ai/elon-musks-twitter-sentiment-analysis-with-transformers-hugging-face-roberta-49b9e61b1433
# source: https://github.com/huggingface/transformers/issues/16746
def sentiment_score(tweets, max_length=511):
    """Return softmax class scores for one piece of text.

    Parameters
    ----------
    tweets : str
        Text to score (despite the plural name, one text per call).
    max_length : int, default 511
        Token limit handed to the tokenizer; longer inputs are truncated.

    Returns
    -------
    numpy.ndarray or None
        Softmax over the first row of the first element of the model output,
        or None when the tokenizer/model call raises RuntimeError. Callers
        must handle the None case explicitly.
    """
    import warnings

    try:
        tokens = tokenizer.encode(
            tweets, return_tensors='pt', truncation=True, max_length=max_length
        )
        # Inference only: do not record operations for autograd.
        with torch.no_grad():
            output = model(tokens)
        scores = output[0][0].detach().numpy()
        return softmax(scores)
    except RuntimeError as err:
        # Best effort is kept (no crash), but the failure is no longer silent.
        warnings.warn(
            f"sentiment_score failed and returned None: {err}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

In [None]:
# Score every preprocessed tweet with the model.
tweet['sentiment'] = tweet['new_text'].apply(sentiment_score)
# sentiment_score returns None when the model raises RuntimeError. Leave those
# tweets unclassified (NaN) instead of handing None to np.argmax, which would
# turn a failed score into a class id.
tweet['sentiment_classification'] = tweet['sentiment'].apply(
    lambda scores: np.nan if scores is None else int(np.argmax(scores))
)

# Exploratory Analysis of Sentiment

In [None]:
# Average predicted class per country.
# Group by both columns (passed as a list). Indexing the frame with
# tweet['country_code', 'country'] looks up a single tuple-named column and
# raises KeyError.
avg_by_country = (
    tweet
    .groupby(['country_code', 'country'])['sentiment_classification']
    .mean()
    .to_frame()
)
# Move only the first index level (country_code) into a column; the country
# name stays as the index.
avg_by_country = avg_by_country.reset_index(level=0)

In [None]:
# Bar chart of the mean predicted class for each country code.
sns.set(font_scale=2)
sns.barplot(
    data=avg_by_country,
    x='country_code',
    y='sentiment_classification',
    palette='colorblind',
)

In [None]:
# Number of tweets for each (predicted class, country) pair, ordered by class
# and then by count, both descending.
count_by_country = (
    tweet
    .groupby(['sentiment_classification', 'country_code', 'country'], as_index=False)
    .size()
    .sort_values(by=['sentiment_classification', 'size'], ascending=False)
)

In [None]:
# One bar-chart panel per predicted class: tweet count by country code.
sns.catplot(
    data=count_by_country,
    kind='bar',
    x='country_code',
    y='size',
    col='sentiment_classification',
    palette='colorblind',
)

# Predicting Peacefulness with Sentiment Analysis

In [6]:
# Prediction with random forest / logistic regression
from random import sample

# Read the CSV again and keep an independent copy of the tweet text together
# with its country code.
full = 'sample.csv'
tweets = pd.read_csv(full, lineterminator='\n')
text = tweets.loc[:, ['text', 'country_code']].copy()

In [None]:
# Repeatedly draw a fixed number of tweets per country, score them, and
# collect per-country statistics for every round.
N_ROUNDS = 20         # number of sampling rounds
N_PER_COUNTRY = 300   # tweets drawn per country in each round
SAMPLE_SEED = 0       # base seed so the draws can be reproduced

rframe = []    # scored tweets of each round
rframe1 = []   # mean predicted class per country, per round
rframe2 = []   # share of each predicted class per country, per round
for i in range(N_ROUNDS):
    # GroupBy.sample keeps every column and the original row labels.
    # It raises ValueError if a country has fewer than N_PER_COUNTRY rows left.
    sample = text.groupby('country_code').sample(
        n=N_PER_COUNTRY, random_state=SAMPLE_SEED + i
    )
    # Remove exactly the rows just drawn so later rounds cannot pick them
    # again. Dropping by label (instead of concat + drop_duplicates) keeps
    # unsampled rows that merely share the same text and country.
    # Assumes the index labels of `text` are unique (the default after read_csv).
    text = text.drop(index=sample.index)

    sample['new_text'] = sample['text'].apply(preprocess)
    sample['sentiment'] = sample['new_text'].apply(sentiment_score)
    # sentiment_score returns None when the model raises RuntimeError; exclude
    # those rows rather than letting np.argmax turn them into a class id.
    sample = sample[sample['sentiment'].notna()].copy()
    sample['sentiment_classification'] = sample['sentiment'].apply(np.argmax)
    rframe.append(sample)

    s_avg_by_country = (
        sample
        .groupby('country_code')['sentiment_classification']
        .mean()
        .reset_index()
    )
    # Rows: country, columns: predicted class id, values: tweet counts.
    # A class that never occurs for a country counts as 0, not NaN.
    s_count_by_country = (
        sample
        .groupby(['country_code', 'sentiment_classification'])
        .size()
        .unstack(fill_value=0)
        .rename_axis(None, axis=1)
    )
    # Counts -> shares of the scored tweets of each country (the row total is
    # N_PER_COUNTRY whenever no score failed).
    s_count_by_country = (
        s_count_by_country
        .div(s_count_by_country.sum(axis=1), axis=0)
        .reset_index()
    )
    rframe1.append(s_avg_by_country)
    rframe2.append(s_count_by_country)

In [None]:
# Stack the per-round frames: scored tweets, per-country means, per-country class shares.
result, result1, result2 = (
    pd.concat(frames) for frames in (rframe, rframe1, rframe2)
)

In [None]:
# Country codes labelled peaceful get target 1; every other code gets 0.
peaceful = ['AU', 'CA', 'FI', 'FR', 'GB', 'IE', 'NO', 'SG']
result1['peacefulness'] = result1['country_code'].isin(peaceful).astype('int64')
result2['peacefulness'] = result2['country_code'].isin(peaceful).astype('int64')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Define a thin
    # stand-in with the same call shape so every later call keeps working.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted estimator on (X, y_true)."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Feature: mean predicted class per country and round. Target: peacefulness flag.
x = result1['sentiment_classification'].to_frame()
y = result1['peacefulness']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

# 10 depths (10..100) x 10 forest sizes (200..2000), evaluated with 5-fold CV.
# The comprehension variable is named `v` so it does not read like the feature frame `x`.
param_grid = {
    'max_depth': [int(v) for v in np.linspace(10, 100, num=10)],
    'n_estimators': [int(v) for v in np.linspace(start=200, stop=2000, num=10)],
}

hyper_tuning = GridSearchCV(estimator=RandomForestClassifier(),
                            param_grid=param_grid,
                            cv=5)

hyper_tuning.fit(x_train, y_train)
hyper_tuning.best_params_

In [None]:
# Random forest on the mean-class feature with fixed hyperparameters.
rf = RandomForestClassifier(n_estimators=200, max_depth=50, n_jobs=-1).fit(x_train, y_train)
yhat = rf.predict(x_test)
# Printed in order: train accuracy, test accuracy, test F1.
print(rf.score(x_train, y_train))
print(rf.score(x_test, y_test))
print(f1_score(y_test, yhat))
plot_confusion_matrix(rf, x_test, y_test)

In [None]:
# The class-share columns of result2 are labelled with the integer class ids
# produced by np.argmax, so select 0 and 2 (the strings '0' and '2' are not
# column labels and raise KeyError). A class that never occurred for a country
# in a round shows up as NaN after the pivot; its share is 0.
x_p = result2[[0, 2]].fillna(0)
y_p = result2['peacefulness']
xp_train, xp_test, yp_train, yp_test = train_test_split(x_p, y_p, test_size=0.30)

# Re-run the same grid search on the class-share features.
hyper_tuning.fit(xp_train, yp_train)
hyper_tuning.best_params_

In [None]:
# Random forest on the class-share features with fixed hyperparameters.
rfp = RandomForestClassifier(n_estimators=1400, max_depth=10, n_jobs=-1).fit(xp_train, yp_train)
yphat = rfp.predict(xp_test)
# Printed in order: train accuracy, test accuracy, test F1.
print(rfp.score(xp_train, yp_train))
print(rfp.score(xp_test, yp_test))
print(f1_score(yp_test, yphat))
plot_confusion_matrix(rfp, xp_test, yp_test)

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logreg = LogisticRegression()

# First fit: mean predicted class as the single feature.
logreg.fit(X=x_train, y=y_train)
print(logreg.coef_, logreg.intercept_)
print(logreg.score(X=x_train, y=y_train))
print(logreg.score(X=x_test, y=y_test))
plot_confusion_matrix(logreg, x_test, y_test)

# Second fit: the same estimator object is refit on the class-share features,
# so after this cell `logreg` holds the second model only.
logreg.fit(X=xp_train, y=yp_train)
print(logreg.coef_, logreg.intercept_)
print(logreg.score(X=xp_train, y=yp_train))
print(logreg.score(X=xp_test, y=yp_test))
plot_confusion_matrix(logreg, xp_test, yp_test)