In [109]:
import pandas as pd 
import numpy as np
import nltk
import random
import re
from bs4 import BeautifulSoup
import emoji

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [110]:
data = pd.read_csv('datasets/twitter_training.csv')
data

Unnamed: 0,id,company,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [111]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [112]:
text = pd.DataFrame()
text['text'] = data['text']
text['sentiment'] = data['sentiment']
text

Unnamed: 0,text,sentiment
0,I mentioned on Facebook that I was struggling ...,Irrelevant
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral
2,@Microsoft Why do I pay for WORD when it funct...,Negative
3,"CSGO matchmaking is so full of closet hacking,...",Negative
4,Now the President is slapping Americans in the...,Neutral
...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant
997,Today sucked so it’s time to drink wine n play...,Positive
998,Bought a fraction of Microsoft today. Small wins.,Positive


Augmentation


In [113]:
from nltk.corpus import wordnet

def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_sentence = words.copy()
    
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        
        if synonyms:
            new_sentence[word_idx] = synonyms[0].lemmas()[0].name()
    
    return " ".join(new_sentence)

text["text"].apply(lambda x: synonym_replacement(x))
# text

0      I mentioned on Facebook that I was fight for m...
1      BBC News - Amazon boss Jeff Bezos rejects clai...
2      @Microsoft Why do I pay for WORD when it funct...
3      CSGO matchmaking is so full of closet hacking,...
4      Now the President is slapping Americans in the...
                             ...                        
995    ⭐️ Toronto is the arts and culture capital of ...
996    tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997    today sucked so it’s time to drink wine n play...
998    Bought a fraction of Microsoft today. Small wins.
999    Johnson & Johnson to stop selling talc baby po...
Name: text, Length: 1000, dtype: object

Tokenization

In [114]:
from nltk.tokenize import word_tokenize

nltk.download('punkt')

text["tokens"] = text["text"].apply(lambda x: word_tokenize(x.lower()))
text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yakina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,sentiment,tokens
0,I mentioned on Facebook that I was struggling ...,Irrelevant,"[i, mentioned, on, facebook, that, i, was, str..."
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,"[bbc, news, -, amazon, boss, jeff, bezos, reje..."
2,@Microsoft Why do I pay for WORD when it funct...,Negative,"[@, microsoft, why, do, i, pay, for, word, whe..."
3,"CSGO matchmaking is so full of closet hacking,...",Negative,"[csgo, matchmaking, is, so, full, of, closet, ..."
4,Now the President is slapping Americans in the...,Neutral,"[now, the, president, is, slapping, americans,..."
...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,"[⭐️, toronto, is, the, arts, and, culture, cap..."
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,"[this, is, actually, a, good, move, tot, bring..."
997,Today sucked so it’s time to drink wine n play...,Positive,"[today, sucked, so, it, ’, s, time, to, drink,..."
998,Bought a fraction of Microsoft today. Small wins.,Positive,"[bought, a, fraction, of, microsoft, today, .,..."


Stemming and Lemmatization

In [115]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

text["stemmed"] = text["tokens"].apply(lambda x: [stemmer.stem(word) for word in x])

text

Unnamed: 0,text,sentiment,tokens,stemmed
0,I mentioned on Facebook that I was struggling ...,Irrelevant,"[i, mentioned, on, facebook, that, i, was, str...","[i, mention, on, facebook, that, i, wa, strugg..."
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,"[bbc, news, -, amazon, boss, jeff, bezos, reje...","[bbc, news, -, amazon, boss, jeff, bezo, rejec..."
2,@Microsoft Why do I pay for WORD when it funct...,Negative,"[@, microsoft, why, do, i, pay, for, word, whe...","[@, microsoft, whi, do, i, pay, for, word, whe..."
3,"CSGO matchmaking is so full of closet hacking,...",Negative,"[csgo, matchmaking, is, so, full, of, closet, ...","[csgo, matchmak, is, so, full, of, closet, hac..."
4,Now the President is slapping Americans in the...,Neutral,"[now, the, president, is, slapping, americans,...","[now, the, presid, is, slap, american, in, the..."
...,...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,"[⭐️, toronto, is, the, arts, and, culture, cap...","[⭐️, toronto, is, the, art, and, cultur, capit..."
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,"[this, is, actually, a, good, move, tot, bring...","[thi, is, actual, a, good, move, tot, bring, m..."
997,Today sucked so it’s time to drink wine n play...,Positive,"[today, sucked, so, it, ’, s, time, to, drink,...","[today, suck, so, it, ’, s, time, to, drink, w..."
998,Bought a fraction of Microsoft today. Small wins.,Positive,"[bought, a, fraction, of, microsoft, today, .,...","[bought, a, fraction, of, microsoft, today, .,..."


In [116]:
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()

text['lemmatization'] = text['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
text

Unnamed: 0,text,sentiment,tokens,stemmed,lemmatization
0,I mentioned on Facebook that I was struggling ...,Irrelevant,"[i, mentioned, on, facebook, that, i, was, str...","[i, mention, on, facebook, that, i, wa, strugg...","[I, , m, e, n, t, i, o, n, e, d, , o, n, , ..."
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,"[bbc, news, -, amazon, boss, jeff, bezos, reje...","[bbc, news, -, amazon, boss, jeff, bezo, rejec...","[B, B, C, , N, e, w, s, , -, , A, m, a, z, ..."
2,@Microsoft Why do I pay for WORD when it funct...,Negative,"[@, microsoft, why, do, i, pay, for, word, whe...","[@, microsoft, whi, do, i, pay, for, word, whe...","[@, M, i, c, r, o, s, o, f, t, , W, h, y, , ..."
3,"CSGO matchmaking is so full of closet hacking,...",Negative,"[csgo, matchmaking, is, so, full, of, closet, ...","[csgo, matchmak, is, so, full, of, closet, hac...","[C, S, G, O, , m, a, t, c, h, m, a, k, i, n, ..."
4,Now the President is slapping Americans in the...,Neutral,"[now, the, president, is, slapping, americans,...","[now, the, presid, is, slap, american, in, the...","[N, o, w, , t, h, e, , P, r, e, s, i, d, e, ..."
...,...,...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,"[⭐️, toronto, is, the, arts, and, culture, cap...","[⭐️, toronto, is, the, art, and, cultur, capit...","[⭐, ️, , T, o, r, o, n, t, o, , i, s, , t, ..."
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,"[this, is, actually, a, good, move, tot, bring...","[thi, is, actual, a, good, move, tot, bring, m...","[t, H, I, S, , I, S, , A, C, T, U, A, L, L, ..."
997,Today sucked so it’s time to drink wine n play...,Positive,"[today, sucked, so, it, ’, s, time, to, drink,...","[today, suck, so, it, ’, s, time, to, drink, w...","[T, o, d, a, y, , s, u, c, k, e, d, , s, o, ..."
998,Bought a fraction of Microsoft today. Small wins.,Positive,"[bought, a, fraction, of, microsoft, today, .,...","[bought, a, fraction, of, microsoft, today, .,...","[B, o, u, g, h, t, , a, , f, r, a, c, t, i, ..."


Removing stopwords, punctuation, HTML tags, emojis

In [117]:
from nltk.corpus import stopwords

stop_words= set(stopwords.words('english'))

In [118]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML
    text = emoji.replace_emoji(text, replace="")  # Remove emojis
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word.lower() not in stop_words])  # Remove stopwords
    return text

text["cleaned_text"] = text["text"].apply(clean_text)
text

Unnamed: 0,text,sentiment,tokens,stemmed,lemmatization,cleaned_text
0,I mentioned on Facebook that I was struggling ...,Irrelevant,"[i, mentioned, on, facebook, that, i, was, str...","[i, mention, on, facebook, that, i, wa, strugg...","[I, , m, e, n, t, i, o, n, e, d, , o, n, , ...",mentioned Facebook struggling motivation go ru...
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,"[bbc, news, -, amazon, boss, jeff, bezos, reje...","[bbc, news, -, amazon, boss, jeff, bezo, rejec...","[B, B, C, , N, e, w, s, , -, , A, m, a, z, ...",BBC News Amazon boss Jeff Bezos rejects claims...
2,@Microsoft Why do I pay for WORD when it funct...,Negative,"[@, microsoft, why, do, i, pay, for, word, whe...","[@, microsoft, whi, do, i, pay, for, word, whe...","[@, M, i, c, r, o, s, o, f, t, , W, h, y, , ...",Microsoft pay WORD functions poorly SamsungUS ...
3,"CSGO matchmaking is so full of closet hacking,...",Negative,"[csgo, matchmaking, is, so, full, of, closet, ...","[csgo, matchmak, is, so, full, of, closet, hac...","[C, S, G, O, , m, a, t, c, h, m, a, k, i, n, ...",CSGO matchmaking full closet hacking truly awf...
4,Now the President is slapping Americans in the...,Neutral,"[now, the, president, is, slapping, americans,...","[now, the, presid, is, slap, american, in, the...","[N, o, w, , t, h, e, , P, r, e, s, i, d, e, ...",President slapping Americans face really commi...
...,...,...,...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,"[⭐️, toronto, is, the, arts, and, culture, cap...","[⭐️, toronto, is, the, art, and, cultur, capit...","[⭐, ️, , T, o, r, o, n, t, o, , i, s, , t, ...",Toronto arts culture capital Canada wonder wan...
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,"[this, is, actually, a, good, move, tot, bring...","[thi, is, actual, a, good, move, tot, bring, m...","[t, H, I, S, , I, S, , A, C, T, U, A, L, L, ...",ACTUALLY GOOD MOVE TOT BRING VIEWERS one peopl...
997,Today sucked so it’s time to drink wine n play...,Positive,"[today, sucked, so, it, ’, s, time, to, drink,...","[today, suck, so, it, ’, s, time, to, drink, w...","[T, o, d, a, y, , s, u, c, k, e, d, , s, o, ...",Today sucked time drink wine n play borderland...
998,Bought a fraction of Microsoft today. Small wins.,Positive,"[bought, a, fraction, of, microsoft, today, .,...","[bought, a, fraction, of, microsoft, today, .,...","[B, o, u, g, h, t, , a, , f, r, a, c, t, i, ...",Bought fraction Microsoft today Small wins


Vectorization: Bag of Words (BoW), Term Frequency - Inverse Document Frequency (TF-IDF), Word2Vec, GloVe

In [119]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(text["cleaned_text"])

In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(text["cleaned_text"])

In [None]:
from gensim.models import Word2Vec

sentences = text["tokens"].tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

word_vectors = [word2vec_model.wv[word] for word in text["tokens"][0] if word in word2vec_model.wv]

[array([-0.14980662,  0.26358312,  0.27949074,  0.16029172,  0.13430594,
        -0.770063  ,  0.19851112,  0.98671335, -0.45796642, -0.34268218,
        -0.04534086, -0.6037515 , -0.02687244,  0.4089622 ,  0.24039221,
        -0.25022992, -0.00730229, -0.46183082, -0.2908544 , -0.9100189 ,
         0.20437923,  0.19696611,  0.4722108 , -0.31159252, -0.03714167,
         0.09962662, -0.4297273 , -0.21443681, -0.36436352,  0.08222747,
         0.438326  , -0.00565408,  0.45318633, -0.5237843 , -0.19714643,
         0.47990394,  0.10523114, -0.16463596, -0.23657478, -0.7222748 ,
         0.15276782, -0.36973652, -0.2262307 ,  0.04806165,  0.35874307,
        -0.25258195, -0.34352645, -0.11285033,  0.21599789,  0.24966887,
         0.30207622, -0.45419502, -0.07115522, -0.04190588, -0.20838967,
         0.2006064 ,  0.22590855, -0.0462656 , -0.37449908,  0.18010317,
         0.13930087,  0.11109078,  0.18737635,  0.05346287, -0.3358487 ,
         0.60402745,  0.17272697,  0.40838048, -0.5

Naive Bayes, SVM, Random Forest algorithm testing

In [123]:
from sklearn.model_selection import train_test_split

X = X_tfidf  
y = text["sentiment"] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [124]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))


Naive Bayes Accuracy: 0.545


In [125]:
from sklearn.svm import SVC

svm = SVC(kernel="linear")
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.53


In [126]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.5
