In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [2]:
data=pd.read_csv("twitter_sentiment.csv",nrows=10000)
data.sample(5)


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
9365,12810,Xbox(Xseries),Irrelevant,HELLS YES!!! SO PUMPED TO FINALLY SEE SOME VAM...
1463,2656,Borderlands,Positive,"My person, you are the best."
952,2569,Borderlands,Positive,Y’all stick with Mario Kart and Borderlands. P...
2810,1688,CallOfDutyBlackopsColdWar,Positive,I have enjoyed the sure the . Happy Black Bom...
6085,247,Amazon,Negative,@ amazon have scheduled reverse pick up since ...


#### Data cleaning 

In [3]:
data=data[['Positive','im getting on borderlands and i will murder you all ,']].reset_index(drop=True)
data.head()

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,"
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [4]:
data.columns=['sentiment','text']
data.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [5]:
data.shape

(10000, 2)

In [6]:
#checking and dropping duplicates
data.duplicated().sum()
data.drop_duplicates(keep='first',inplace=True)
data.shape

(9429, 2)

In [7]:
# checking nan values
data.isna().sum()

sentiment    0
text         4
dtype: int64

In [8]:
data.dropna(subset=['text'],axis=0,inplace=True)
data.shape

(9425, 2)

In [9]:
lencoder=LabelEncoder()
data['sentiment']=lencoder.fit_transform(data['sentiment'])

In [10]:
lencoder.inverse_transform([0,1,2,3])

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

### Text processing

In [11]:
data.sample(10)

Unnamed: 0,sentiment,text
641,3,Who runs borderlands 3 on xbox that game is so...
9465,0,4 This is a lie.. I've had a taste... it taste...
7079,0,FUCK YALL TOXIC PEOPLE ON OVERWATCH
2560,2,ZOMBIES IS BACK!!! The fucking good.
6384,2,Best fantasy book I've read in ages. The story...
971,1,This cricket has been the worst hivemind of fa...
4619,2,@BlackOpsColdWar nice to know. You're treated....
3746,0,Know your banks except the history that hurts ...
5767,2,RT @ CBInsights: Yes
903,3,The Atleast I have Borderlands to come cheer m...


In [12]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def text_process(text):
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    
    # Convert tokens to lowercase
    tokens = [word.lower() for word in tokens]
    
    # Convert list of tokens to a single string
    text = ' '.join(tokens)
    
    # Function to remove HTML tags
    def remove_html_tags(text):
        clean_text = re.sub(r'<.*?>', '', text)
        return clean_text
    
    # Function to remove stopwords
    def remove_stopwords(text):
        words = [word for word in text.split() if word.lower() not in stop_words]
        return " ".join(words)
    
    # Function to clean URLs
    def clean_url(text):
        text = re.sub(r"((https:|http|ftp)?(:\/\/)?(www\.)?)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)", ' ', text)
        return re.sub(r'/', ' / ', text)
    
    # Function to clean punctuations
    def clean_punctuations(text):
        return text.translate(str.maketrans('', '', string.punctuation))
    
    # Function to remove repeating characters
    def clean_repeating_char(text):
        return re.sub(r"(.)\1\1+", r"\1\1", text)
    
    # Function to clean numbers
    def clean_numbers(text):
        return re.sub('[0-9]+', '', text)
    
    # Function to remove hashtags
    def remove_hashtag(text):
        return re.sub('#[\w\d]+', ' ', text)
    
    # Function to clean usernames
    def clean_username(text):
        return re.sub('@[^\s]+', ' ', text)
    
    # Function to clean emojis and non-ASCII characters
    def clean_non_ascii(text):
        text = text.encode("ascii", "ignore").decode()
        return text
    
    # Function to remove images
    def remove_images(tweet):
        cleaned_tweet = re.sub(r"pic\.twitter\.com/\S+", '', tweet)
        cleaned_tweet = re.sub("\w+(\.png|\.jpg|\.gif|\.jpeg)", " ", cleaned_tweet)
        return cleaned_tweet
    
    # Function to lemmatize words
    def lemmatize_words(text):
        return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    
    # Apply all preprocessing steps
    text = remove_html_tags(text)
    text = remove_stopwords(text)
    text = clean_url(text)
    text = clean_punctuations(text)
    text = clean_repeating_char(text)
    text = clean_numbers(text)
    text = remove_hashtag(text)
    text = clean_username(text)
    text = clean_non_ascii(text)  
    text = remove_images(text)
    text = lemmatize_words(text)
    
    return text


In [13]:
data['text_cleaned'] = data['text'].apply(text_process)

In [14]:
data.sample(10)

Unnamed: 0,sentiment,text,text_cleaned
8946,3,God hates me bc I can't connect to the servers...,god hate bc ca nt connect server game actually...
6538,2,Browse Our Collection in Healthy Products at A...,browse collection healthy product amazon inclu...
7895,3,"People are mad hating on this skin. Well, I th...",people mad hating skin well think skin absolut...
3989,0,I give up... Too tired. RNG hated me for years...,give tired rng hated year already still nt let go
9803,3,"To be fair, this looks way cleaner & more shar...",fair look way cleaner sharp sleek design xbox ...
9542,1,Very Important,important
2268,1,@ DuvalMagic why tf can I just free war but no...,duvalmagic tf free war release plz someone gea...
4745,2,I played this interesting quiz on Amazon - Try...,played interesting quiz amazon try luck chance...
8816,0,me four hours ago: i sense........ something m...,four hour ago sense something moving overwatch...
2742,1,This shit looks like the latest World War II z...,shit look like latest world war ii zombie


### Model building

In [15]:
# vextorizer
tfidf=TfidfVectorizer(max_features=10000)
X=tfidf.fit_transform(data['text_cleaned']).toarray()
X.shape

(9425, 7802)

In [16]:
# Normalization
scaler=MinMaxScaler()
X=scaler.fit_transform(X)

In [17]:
y=data['sentiment'].to_numpy()
y=y.reshape(-1)
y

array([3, 3, 3, ..., 3, 3, 3])

In [18]:
# train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,stratify=y,shuffle=True,random_state=42)

In [19]:
X_train.shape

(6597, 7802)

In [20]:
#model training using RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=80, random_state=42)
rfc.fit(X_train, y_train)

In [21]:
# evaluation
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test,rfc_pred))
print(f"The accuracy score is {accuracy_score(y_test,rfc_pred)}" )

              precision    recall  f1-score   support

           0       0.94      0.88      0.91       522
           1       0.94      0.91      0.92       658
           2       0.90      0.94      0.92       744
           3       0.92      0.95      0.93       904

    accuracy                           0.92      2828
   macro avg       0.93      0.92      0.92      2828
weighted avg       0.92      0.92      0.92      2828

The accuracy score is 0.9229137199434229


In [29]:
# Saving the vectorizer
pickle.dump(tfidf,open('rfc_vectorizer.pkl','wb'))
# Saving the model
pickle.dump(rfc,open("Rfc_model.pkl","wb"))

