<a href="https://colab.research.google.com/github/zahrasa/Tweet_Sentiment_Analysis/blob/main/Tweet_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import os
import pandas as pd 
import numpy as np 
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.5.3.tar.gz (14 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone
  Created wheel for gdown: filename=gdown-4.5.3-py3-none-any.whl size=14840 sha256=161393ab3ecc98e216c37fac18df0a9b07c02ddc427f57c6fb15e853e484a342
  Stored in directory: /tmp/pip-ephem-wheel-cache-eb6g46aq/wheels/94/8d/0b/bdcd83555c3555f91a33f6c2384428d9f163c7d75ab0d272b4
Successfully built gdown
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.5.3


In [3]:
# Download the dataset
! rm -rf * 
! gdown 1KszzQywPgfVNhSfp9RwlCR5jLEX1PunZ

Downloading...
From: https://drive.google.com/uc?id=1KszzQywPgfVNhSfp9RwlCR5jLEX1PunZ
To: /content/comments.zip
  0% 0.00/1.25M [00:00<?, ?B/s]100% 1.25M/1.25M [00:00<00:00, 130MB/s]


In [4]:
# Extract the downloaded archive into the working directory, then delete the archive.
! unzip "comments.zip"
! rm -r "comments.zip"

Archive:  comments.zip
  inflating: test.csv                
  inflating: __MACOSX/._test.csv     
  inflating: train.csv               
  inflating: __MACOSX/._train.csv    


## Working with train.csv and test.csv

In [5]:
# Read both extracted splits as UTF-8 and preview the last rows of the training set.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('./train.csv', './test.csv')
)
train.tail()

Unnamed: 0,text,sense,honor,curse,despise,situation,antihuman,roughness,slaughter,strike_support,depression_rate
15781,Would you marry a Bengali — Loooool yes ... I ...,1.5,1.75,1.5,1.5,2.25,1.0,0.25,0.25,2.0,0.0
15782,I am about to commit hate crime,4.0,4.0,3.0,3.0,2.0,3.0,4.0,4.0,4.0,2.0
15783,go back to somalia ilhan omar! -guy from the p...,3.33,3.33,2.67,1.67,2.33,0.67,0.33,0.0,3.0,0.0
15784,I agree with noah. Depression or mental illnes...,3.0,2.33,2.33,2.0,3.0,1.67,0.33,0.0,1.67,0.0
15785,Love sharing my Gf with Asian boys! HMU if you...,3.67,3.33,2.67,2.0,2.67,2.33,1.67,0.33,2.0,0.0


# Tokenizing and vectorizing text

In [6]:
# NLTK setup: fetch the resources used for preprocessing and build the stop-word collections.
import nltk
# Download the 'punkt' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words as a set.
STOPWORDS = set(stopwords.words('english'))
# The same English stop words as a list; remove_stopwords() filters against this list.
stopword_list = nltk.corpus.stopwords.words('english')

# NOTE(review): of the names imported below, only ToktokTokenizer is referenced in the
# visible cells; the rest look unused here — confirm before removing.
from nltk import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize  # duplicate: word_tokenize is already imported above
import requests
from string import punctuation

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Shared Toktok tokenizer instance; remove_stopwords() uses it to split text into tokens.
tokenizer = ToktokTokenizer()

### Removing stop words

In [8]:
# Build the English stop-word set and print it once for inspection.
# NOTE(review): `stop` is not referenced again in the visible cells; remove_stopwords()
# filters against `stopword_list` instead.
stop = set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Return `text` with English stop words removed.

    The text is split with the module-level `tokenizer`, each token is stripped
    of surrounding whitespace, and tokens found in `stopword_list` are dropped.
    The surviving tokens keep their original casing and are joined with single
    spaces.

    Args:
        text: the string to filter.
        is_lower_case: if True, tokens are compared against the stop-word list
            as-is; if False, each token is lower-cased for the comparison only.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        # Only the lookup key is lower-cased; the emitted token is unchanged.
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopword_list:
            kept.append(token)
    return ' '.join(kept)


{'after', 'most', 've', 'them', 'haven', 'during', 'should', 'more', "shouldn't", 'wasn', 'what', 'from', 'once', 'd', 'ain', 'am', 'in', 'no', 'been', 'any', 'was', 'an', 'it', 'couldn', 'he', 't', 'this', 'whom', 'all', 'themselves', 'through', 'her', "hasn't", 'a', 'having', "couldn't", 'herself', 'over', 'didn', "doesn't", 'where', 'who', "needn't", 'does', "don't", 'hers', 'here', 'itself', "you've", "didn't", 'y', 'there', 'needn', 'nor', "you'll", 'between', "that'll", 'be', 'each', 'him', 'my', 'same', 'were', 'again', 'wouldn', "you'd", 'll', 'for', 'will', 'myself', 'as', 'while', 'mustn', 'down', 'about', 'm', 'did', 'some', 's', "hadn't", 'why', "mightn't", 'shouldn', 'but', 'against', 'mightn', 'weren', 'hadn', "mustn't", 'if', 'doing', "you're", 'not', 'ourselves', 'your', 'because', 'yours', 'off', 'his', 'to', 'shan', "haven't", 'few', 'too', 'further', 'had', 'we', 'then', "she's", 'aren', 'just', 're', 'until', 'me', 'so', "wasn't", "shan't", 'only', 'the', "isn't", '

In [9]:
# Strip stop words from the `text` column of both splits, then preview the training set.
train['text'] = train['text'].map(remove_stopwords)
test['text'] = test['text'].map(remove_stopwords)

train.head()

Unnamed: 0,text,sense,honor,curse,despise,situation,antihuman,roughness,slaughter,strike_support,depression_rate
0,amazing ! Kudos women strong,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,@IlhanMN Omar-you racist ! Resign ! deported !,4.0,3.67,3.33,2.67,3.0,1.33,1.33,0.67,3.0,0.0
2,feel same. El Paso made open understanding per...,1.0,1.5,0.5,0.5,2.0,1.5,0.5,0.5,1.5,0.0
3,Stop putting ugly white person tl,3.67,3.0,2.67,2.33,2.33,1.0,0.0,0.0,2.33,0.0
4,Gautam Gambhir ! ! ! Lol. Come ! ! ! Elections...,3.33,3.33,2.0,2.0,2.0,1.67,0.33,0.0,3.0,0.33


### Removing special characters

In [10]:
def remove_special_characters(text, remove_digits=True):
    """Return `text` with everything except ASCII letters, digits and whitespace removed.

    Args:
        text: the string to clean.
        remove_digits: accepted for backward compatibility but currently has no
            effect; digits are always kept.
            NOTE(review): decide whether this flag should be honoured or dropped.

    Returns:
        The cleaned string.
    """
    # The upper-case range must be `A-Z`: the range `A-z` also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [11]:
# Remove special characters from the `text` column of both splits, then preview the training set.
train['text'] = train['text'].map(remove_special_characters)
test['text'] = test['text'].map(remove_special_characters)

train.head()

Unnamed: 0,text,sense,honor,curse,despise,situation,antihuman,roughness,slaughter,strike_support,depression_rate
0,amazing Kudos women strong,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,IlhanMN Omaryou racist Resign deported,4.0,3.67,3.33,2.67,3.0,1.33,1.33,0.67,3.0,0.0
2,feel same El Paso made open understanding pers...,1.0,1.5,0.5,0.5,2.0,1.5,0.5,0.5,1.5,0.0
3,Stop putting ugly white person tl,3.67,3.0,2.67,2.33,2.33,1.0,0.0,0.0,2.33,0.0
4,Gautam Gambhir Lol Come Elections India ...,3.33,3.33,2.0,2.0,2.0,1.67,0.33,0.0,3.0,0.33


### Stemming

In [12]:
def simple_stemmer(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem.

    The stems are joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [13]:
# Stem every word in the `text` column of both splits, then preview the training set.
train['text'] = train['text'].map(simple_stemmer)
test['text'] = test['text'].map(simple_stemmer)

train.head()

Unnamed: 0,text,sense,honor,curse,despise,situation,antihuman,roughness,slaughter,strike_support,depression_rate
0,amaz kudo women strong,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,ilhanmn omary racist resign deport,4.0,3.67,3.33,2.67,3.0,1.33,1.33,0.67,3.0,0.0
2,feel same el paso made open understand person ...,1.0,1.5,0.5,0.5,2.0,1.5,0.5,0.5,1.5,0.0
3,stop put ugli white person tl,3.67,3.0,2.67,2.33,2.33,1.0,0.0,0.0,2.33,0.0
4,gautam gambhir lol come elect india conduct co...,3.33,3.33,2.0,2.0,2.0,1.67,0.33,0.0,3.0,0.33


### Removing low frequency words

In [14]:
# Drop rare tokens from both splits.
#
# The rare-token list is derived from the TRAINING corpus only and applied to both
# splits, so train and test go through identical preprocessing. Filtering the test
# split by its own counts would delete words that merely look rare because the test
# set is small, even though they are frequent in train.
MIN_TOKEN_COUNT = 5  # tokens seen fewer than this many times in train are dropped

freq_train = pd.Series(' '.join(train['text']).split()).value_counts()
less_five_freq_train = freq_train[freq_train < MIN_TOKEN_COUNT]
print(less_five_freq_train)

# Test-set frequencies are reported for inspection only; they are not used for filtering.
freq_test = pd.Series(' '.join(test['text']).split()).value_counts()
less_five_freq_test = freq_test[freq_test < MIN_TOKEN_COUNT]
print(less_five_freq_test)

# The tokens are the index labels of the value_counts() result.
rare_train_tokens = set(less_five_freq_train.index)


def _drop_rare_tokens(text):
    """Return `text` without the tokens listed in `rare_train_tokens`."""
    return ' '.join(token for token in text.split() if token not in rare_train_tokens)


train['text'] = train['text'].apply(_drop_rare_tokens)
test['text'] = test['text'].apply(_drop_rare_tokens)

sharif          4
stupidest       4
corybook        4
vp              4
talkin          4
               ..
nbaespnmedia    1
peaceful        1
palestina       1
frankunlaw      1
loooool         1
Length: 16169, dtype: int64
sub              4
sleep            4
wild             4
homophobia       4
side             4
                ..
epimoniamn       1
meimeifox        1
forb             1
to6              1
predominantli    1
Length: 4682, dtype: int64


### Normalizing

In [15]:
# Preprocessed training text that is passed to the vectorizer below
norm_train_reviews = train['text'][:]
norm_train_reviews.head()

0                               amaz kudo women strong
1                         ilhanmn racist resign deport
2    feel same el paso made open understand person ...
3                        stop put ugli white person tl
4    lol come elect india conduct control us uk bjp...
Name: text, dtype: object

In [16]:
# Preprocessed test text that is passed to the vectorizer below
norm_test_reviews = test['text'][:]

### TF-IDF vectorization

In [17]:
# TF-IDF features: the vectorizer is fitted on the training text only
tv = TfidfVectorizer()
tv_train_reviews = tv.fit_transform(norm_train_reviews)

# The test text is only transformed with the already-fitted vectorizer,
# so both matrices have the same number of feature columns
tv_test_reviews = tv.transform(norm_test_reviews)

print(f'Tfidf_train: {tv_train_reviews.shape}')
print(f'Tfidf_test: {tv_test_reviews.shape}')

Tfidf_train: (15786, 4189)
Tfidf_test: (1754, 4189)


In [None]:
# tv.get_feature_names()

# Modeling

In [19]:
# Feature matrices used by the models below
x_train, x_test = tv_train_reviews, tv_test_reviews

In [20]:
# One independent random-forest regressor is trained per label column.
TARGET_COLUMNS = [
    'sense', 'honor', 'curse', 'despise', 'situation',
    'antihuman', 'roughness', 'slaughter', 'strike_support', 'depression_rate',
]
RANDOM_STATE = 42  # fixed seed so repeated runs build the same forests


def fit_predict_target(target):
    """Fit a random forest on one label column and predict it for the test set.

    Args:
        target: name of the column in `train` used as the regression target.

    Returns:
        A list with one prediction per row of `x_test`, rounded to 3 decimals.
    """
    forest = RandomForestRegressor(n_estimators=50, random_state=RANDOM_STATE)
    forest.fit(x_train, train[target])
    # Rounding to 3 digits
    return list(np.around(forest.predict(x_test), 3))


predictions = {target: fit_predict_target(target) for target in TARGET_COLUMNS}

# Keep the per-target names that the result-CSV cells read.
(sense, honor, curse, despise, situation,
 antihuman, roughness, slaughter, strike_support, depression_rate) = (
    predictions[target] for target in TARGET_COLUMNS
)

# Creating result csv

In [30]:
# Materialise the rows as a list: a bare zip() is a one-shot iterator, so re-running
# the cell that writes the CSV would otherwise produce a file with only the header.
rows = list(zip(sense, honor, curse, despise, situation, antihuman, roughness, slaughter, strike_support, depression_rate))

In [31]:
import csv

# Write one row of predictions per test sample, preceded by a header row.
# newline='' lets the csv module control line endings itself; without it some
# platforms emit an extra blank line after every row.
with open('/content/output.csv', "w", newline='') as f:
    writer = csv.writer(f)
    # add header
    writer.writerow(['sense', 'honor', 'curse', 'despise', 'situation', 'antihuman', 'roughness', 'slaughter', 'strike_support', 'depression_rate'])
    # add results
    writer.writerows(rows)