# Import Libraries

In [118]:
from cleaning_tweets import *
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Data Preprocessing

### Coronavirus Dataset

https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification

In [119]:
# import the tweets datasets
coronavirus_tweets_nlp_train = pd.read_csv('tweets dataset/Coronavirus tweets NLP - Text Classification/tweets_train.csv', encoding="ISO-8859-1")
coronavirus_tweets_nlp_test = pd.read_csv('tweets dataset/Coronavirus tweets NLP - Text Classification/tweets_test.csv', encoding="ISO-8859-1")

In [120]:
# check for missing values in text and label columns
print(coronavirus_tweets_nlp_train.isnull().sum())
print('\n')
print(coronavirus_tweets_nlp_test.isnull().sum())

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64


UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64


In [121]:
texts_new_train = []
for t in coronavirus_tweets_nlp_train.OriginalTweet:
    texts_new_train.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(emoji.demojize(t))))))

# Add the cleaned tweets to the dataframe
coronavirus_tweets_nlp_train['cleaned_tweets'] = texts_new_train

In [122]:
texts_new_test = []
for t in coronavirus_tweets_nlp_test.OriginalTweet:
    texts_new_test.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(emoji.demojize(t))))))

# Add the cleaned tweets to the dataframe
coronavirus_tweets_nlp_test['cleaned_tweets'] = texts_new_test

In [123]:
coronavirus_tweets_nlp_train['Sentiment'] = coronavirus_tweets_nlp_train['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})
coronavirus_tweets_nlp_test['Sentiment'] = coronavirus_tweets_nlp_test['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

### IT Dataset

https://github.com/charlesmalafosse/open-dataset-for-sentiment-analysis/blob/62425b270bcf7561b7a6f7821a09f5bf522a798f/betsentiment-IT-tweets-sentiment-teams-split.zip.001

In [124]:
it_df = pd.read_csv('tweets dataset/Open IT Dataset for Sentiment Analysis/betsentiment-IT-tweets-sentiment-players.csv', encoding='ISO-8859-1')

In [125]:
# check for missing values in text and label columns
print(it_df.isnull().sum())
print('\n')

tweet_date_created    0
tweet_id              0
tweet_text            0
language              0
sentiment             0
sentiment_score       0
dtype: int64




In [126]:
texts_new_train = []
for t in it_df.tweet_text:
    texts_new_train.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(emoji.demojize(t))))))

# Add the cleaned tweets to the dataframe
it_df['cleaned_tweets'] = texts_new_train

In [127]:
it_df['sentiment'] = it_df['sentiment'].map({'NEGATIVE':0,'NEUTRAL':1,'MIXED':1,'POSITIVE':2})

In [128]:
# split the df into train and test
it_df_train, it_df_test = train_test_split(it_df, test_size=0.2, random_state=42)

### ChatGPT Dataset

https://www.kaggle.com/datasets/charunisa/chatgpt-sentiment-analysis

In [129]:
chatgpt_df = pd.read_csv('tweets dataset/ChatGPT sentiment analysis/file.csv')
chatgpt_df = chatgpt_df[['tweets', 'labels']]

In [130]:
# check for missing values in text and label columns
print(chatgpt_df.isnull().sum())
print('\n')

tweets    0
labels    0
dtype: int64




In [131]:
texts_new_train = []
for t in chatgpt_df.tweets:
    texts_new_train.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(emoji.demojize(t))))))

# Add the cleaned tweets to the dataframe
chatgpt_df['cleaned_tweets'] = texts_new_train

In [132]:
chatgpt_df['labels'] = chatgpt_df['labels'].map({'bad':0,'neutral':1,'good':2})

In [133]:
# split the df into train and test
chatgpt_df_train, chatgpt_df_test = train_test_split(chatgpt_df, test_size=0.2, random_state=42)

### Overview of the datasets

In [134]:
# print total number of tweets in each dataset
print('Number of tweets in the Covid19 dataset:', len(coronavirus_tweets_nlp_train))
print('Number of tweets in the Covid19 test dataset:', len(coronavirus_tweets_nlp_test))
print('Number of tweets in each class in the Covid19 training dataset:')
print(coronavirus_tweets_nlp_train['Sentiment'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
print('\n')
print('Number of tweets in the Italian dataset:', len(it_df_train))
print('Number of tweets in the Italian test dataset:', len(it_df_test))
print('Number of tweets in each class in the Italian training set:')
print(it_df_train['sentiment'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
print('\n')
print('Number of tweets in the ChatGPT dataset:', len(chatgpt_df_train))
print('Number of tweets in the ChatGPT test dataset:', len(chatgpt_df_test))
print('Number of tweets in each class in the ChatGPT training set:')
print(chatgpt_df_train['labels'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%')



Number of tweets in the Covid19 dataset: 41157
Number of tweets in the Covid19 test dataset: 3798
Number of tweets in each class in the Covid19 training dataset:
2    43.8%
0    37.4%
1    18.7%
Name: Sentiment, dtype: object


Number of tweets in the Italian dataset: 132652
Number of tweets in the Italian test dataset: 33163
Number of tweets in each class in the Italian training set:
1    81.9%
2    14.2%
0     3.9%
Name: sentiment, dtype: object


Number of tweets in the ChatGPT dataset: 175435
Number of tweets in the ChatGPT test dataset: 43859
Number of tweets in each class in the ChatGPT training set:
0    49.2%
2    25.5%
1    25.3%
Name: labels, dtype: object


# **PassiveAggressive**

### Coronavirus Dataset

In [135]:
# create x_train, y_train, x_test, y_test
coronavirus_tweets_nlp_x_train = coronavirus_tweets_nlp_train['cleaned_tweets']
coronavirus_tweets_nlp_y_train = coronavirus_tweets_nlp_train['Sentiment']
coronavirus_tweets_nlp_x_test = coronavirus_tweets_nlp_test['cleaned_tweets']
coronavirus_tweets_nlp_y_test = coronavirus_tweets_nlp_test['Sentiment']

In [136]:
label = ['0', '1', '2']

In [137]:
# initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# fit and transform train set, transform test set
coronavirus_tweets_nlp_tfidf_train = tfidf_vectorizer.fit_transform(coronavirus_tweets_nlp_x_train)
coronavirus_tweets_nlp_tfidf_test = tfidf_vectorizer.transform(coronavirus_tweets_nlp_x_test)

In [138]:
# initialize a PassiveAggressiveClassifier and fit the model
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(coronavirus_tweets_nlp_tfidf_train, coronavirus_tweets_nlp_y_train)
y_pred = pac.predict(coronavirus_tweets_nlp_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(coronavirus_tweets_nlp_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(coronavirus_tweets_nlp_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.74
Precision: 0.74
Recall: 0.74

Confusion Matrix:
[[1219  164  250]
 [ 146  356  117]
 [ 206  101 1239]]




### IT Dataset

In [139]:
# create x_train, y_train, x_test, y_test
it_x_train = it_df_train['cleaned_tweets']
it_y_train = it_df_train['sentiment']
it_x_test = it_df_test['cleaned_tweets']
it_y_test = it_df_test['sentiment']

In [140]:
label = ['0', '1', '2']

In [141]:
# initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# fit and transform train set, transform test set
it_tfidf_train = tfidf_vectorizer.fit_transform(it_x_train)
it_tfidf_test = tfidf_vectorizer.transform(it_x_test)

In [142]:
# initialize a PassiveAggressiveClassifier and fit the model
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(it_tfidf_train, it_y_train)
y_pred = pac.predict(it_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(it_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(it_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(it_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(it_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.82
Precision: 0.81
Recall: 0.82

Confusion Matrix:
[[  350   877    93]
 [  541 24441  2142]
 [   71  2303  2345]]


### ChatGPT Dataset

In [143]:
# create x_train, y_train, x_test, y_test
chatgpt_x_train = chatgpt_df_train['cleaned_tweets']
chatgpt_y_train = chatgpt_df_train['labels']
chatgpt_x_test = chatgpt_df_test['cleaned_tweets']
chatgpt_y_test = chatgpt_df_test['labels']

In [144]:
label = ['0', '1', '2']

In [145]:
# initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

# fit and transform train set, transform test set
chatgpt_tfidf_train = tfidf_vectorizer.fit_transform(chatgpt_x_train)
chatgpt_tfidf_test = tfidf_vectorizer.transform(chatgpt_x_test)

In [146]:
# initialize a PassiveAggressiveClassifier and fit the model
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(chatgpt_tfidf_train, chatgpt_y_train)
y_pred = pac.predict(chatgpt_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(chatgpt_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(chatgpt_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(chatgpt_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(chatgpt_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.80
Precision: 0.80
Recall: 0.80

Confusion Matrix:
[[19671  1543   260]
 [ 2576  6776  1829]
 [  433  2036  8735]]


# **Logistic Regression**

### Coronavirus Dataset

In [147]:
# inizialite logistic regression and fit the model
logreg = LogisticRegression()
logreg.fit(coronavirus_tweets_nlp_tfidf_train, coronavirus_tweets_nlp_y_train)
y_pred = logreg.predict(coronavirus_tweets_nlp_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(coronavirus_tweets_nlp_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(coronavirus_tweets_nlp_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.77
Precision: 0.77
Recall: 0.77

Confusion Matrix:
[[1289  100  244]
 [ 155  353  111]
 [ 193   60 1293]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### IT Dataset

In [148]:
# initialize a logistic regression and fit the model
logreg = LogisticRegression()
logreg.fit(it_tfidf_train, it_y_train)
y_pred = logreg.predict(it_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(it_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(it_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(it_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(it_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.85
Precision: 0.84
Recall: 0.85

Confusion Matrix:
[[  194  1107    19]
 [   99 26144   881]
 [   10  2745  1964]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### ChatGPT Dataset

In [149]:
# initialize a logistic regression and fit the model
logreg = LogisticRegression()
logreg.fit(chatgpt_tfidf_train, chatgpt_y_train)
y_pred = logreg.predict(chatgpt_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(chatgpt_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(chatgpt_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(chatgpt_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(chatgpt_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.83
Precision: 0.82
Recall: 0.83

Confusion Matrix:
[[20364   839   271]
 [ 2529  6937  1715]
 [  581  1558  9065]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# **Random Forrest**

### Coronavirus Dataset

In [150]:
# inizialite random forrest and fit the model
rf = RandomForestClassifier()
rf.fit(coronavirus_tweets_nlp_tfidf_train, coronavirus_tweets_nlp_y_train)
y_pred = rf.predict(coronavirus_tweets_nlp_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(coronavirus_tweets_nlp_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(coronavirus_tweets_nlp_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.71
Precision: 0.71
Recall: 0.71

Confusion Matrix:
[[1145  151  337]
 [ 146  349  124]
 [ 253  103 1190]]


### IT Dataset

In [151]:
#initialize a random forrest and fit the model
rf = RandomForestClassifier()
rf.fit(it_tfidf_train, it_y_train)
y_pred = rf.predict(it_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(it_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(it_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(it_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(it_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.84
Precision: 0.82
Recall: 0.84

Confusion Matrix:
[[   29  1287     4]
 [   20 26646   458]
 [    0  3551  1168]]


### ChatGPT Dataset

In [152]:
#initialize a random forrest and fit the model
rf = RandomForestClassifier()
rf.fit(chatgpt_tfidf_train, chatgpt_y_train)
y_pred = rf.predict(chatgpt_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(chatgpt_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(chatgpt_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(chatgpt_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(chatgpt_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.80
Precision: 0.79
Recall: 0.80

Confusion Matrix:
[[19852   973   649]
 [ 2818  6718  1645]
 [  870  1991  8343]]


# **Support Vector Classifier**

### Coronavirus Dataset

In [153]:
# inizialite svc and fit the model
svc = LinearSVC()
svc.fit(coronavirus_tweets_nlp_tfidf_train, coronavirus_tweets_nlp_y_train)
y_pred = svc.predict(coronavirus_tweets_nlp_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(coronavirus_tweets_nlp_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(coronavirus_tweets_nlp_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.78
Precision: 0.78
Recall: 0.78

Confusion Matrix:
[[1302  117  214]
 [ 143  380   96]
 [ 186   64 1296]]


### IT Dataset

In [154]:
#initialize svc and fit the model
svc = LinearSVC()
svc.fit(it_tfidf_train, it_y_train)
y_pred = svc.predict(it_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(it_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(it_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(it_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(it_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.85
Precision: 0.83
Recall: 0.85

Confusion Matrix:
[[  234  1044    42]
 [  141 25899  1084]
 [   14  2612  2093]]


### ChatGPT Dataset

In [155]:
#initialize svc and fit the model
svc = LinearSVC()
svc.fit(chatgpt_tfidf_train, chatgpt_y_train)
y_pred = svc.predict(chatgpt_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(chatgpt_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(chatgpt_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(chatgpt_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(chatgpt_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.82
Precision: 0.82
Recall: 0.82

Confusion Matrix:
[[20477   692   305]
 [ 2609  6223  2349]
 [  410  1329  9465]]


# **XGBoost**

### Coronavirus Dataset

In [156]:
# inizialite XGB and fit the model
xgb = XGBClassifier()
xgb.fit(coronavirus_tweets_nlp_tfidf_train, coronavirus_tweets_nlp_y_train)
y_pred = xgb.predict(coronavirus_tweets_nlp_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(coronavirus_tweets_nlp_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(coronavirus_tweets_nlp_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(coronavirus_tweets_nlp_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.73
Precision: 0.74
Recall: 0.73

Confusion Matrix:
[[1150  185  298]
 [  95  430   94]
 [ 219  125 1202]]


### IT Dataset

In [157]:
# initialize XGB and fit the model
xgb = XGBClassifier()
xgb.fit(it_tfidf_train, it_y_train)
y_pred = xgb.predict(it_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(it_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(it_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(it_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(it_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.85
Precision: 0.83
Recall: 0.85

Confusion Matrix:
[[  124  1180    16]
 [   87 26341   696]
 [   13  2997  1709]]


### ChatGPT Dataset

In [158]:
# initialize XGB and fit the model
xgb = XGBClassifier()
xgb.fit(chatgpt_tfidf_train, chatgpt_y_train)
y_pred = xgb.predict(chatgpt_tfidf_test)

# print the classification report
print(f'Accuracy: {accuracy_score(chatgpt_y_test, y_pred):.2f}')
print(f'Precision: {precision_score(chatgpt_y_test, y_pred, average="weighted"):.2f}')
print(f'Recall: {recall_score(chatgpt_y_test, y_pred, average="weighted"):.2f}\n')
print('Confusion Matrix:')
print(confusion_matrix(chatgpt_y_test, y_pred, labels=[0, 1, 2]))

Accuracy: 0.74
Precision: 0.73
Recall: 0.74

Confusion Matrix:
[[20188   693   593]
 [ 4947  4794  1440]
 [ 1945  1795  7464]]
