In [1]:
import pandas as pd

import re
import string

import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('/kaggle/input/masculinity-saturday-twitter-sentiment-analysis/twitter_sentiment_analysis_masculinity_saturday_data_annotated.csv')

In [3]:
print(df.head())

   Unnamed: 0           tweet_created_at  \
0           0  2022-12-30 22:37:46+00:00   
1           1  2022-12-30 22:30:58+00:00   
2           2  2022-12-30 22:18:00+00:00   
3           3  2022-12-30 20:42:54+00:00   
4           4  2022-12-30 20:32:23+00:00   

                                                text  Positive  Negative  \
0  Men who have refused to listen to/attend @amer...     0.000     0.073   
1                       @OvOBrezzzy Take amerix next     0.000     0.000   
2  @AyodeleYo11 @amerix Nah I’m good my masculini...     0.493     0.068   
3  @Amerix_DontSimp There is no need for dating i...     0.000     0.180   
4  @DyeAnna7 @amerix @kibe From what cos nothing ...     0.110     0.000   

   Neutral  Compound Sentiment  
0    0.927   -0.2960  Negative  
1    1.000    0.0000   Neutral  
2    0.439    0.8271  Positive  
3    0.820   -0.2960  Negative  
4    0.890    0.6908  Positive  


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248738 entries, 0 to 248737
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        248738 non-null  int64  
 1   tweet_created_at  248738 non-null  object 
 2   text              248738 non-null  object 
 3   Positive          248738 non-null  float64
 4   Negative          248738 non-null  float64
 5   Neutral           248738 non-null  float64
 6   Compound          248738 non-null  float64
 7   Sentiment         248738 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 15.2+ MB


In [5]:
df = df.drop(columns=['Unnamed: 0', 'Positive', 'Negative',
       'Neutral', 'Compound'])

In [6]:
df.columns

Index(['tweet_created_at', 'text', 'Sentiment'], dtype='object')

In [7]:
df['tweet_created_at'] = df['tweet_created_at'].astype('datetime64[ns]')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248738 entries, 0 to 248737
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   tweet_created_at  248738 non-null  datetime64[ns]
 1   text              248738 non-null  object        
 2   Sentiment         248738 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 5.7+ MB


In [9]:
print(df.head())

     tweet_created_at                                               text  \
0 2022-12-30 22:37:46  Men who have refused to listen to/attend @amer...   
1 2022-12-30 22:30:58                       @OvOBrezzzy Take amerix next   
2 2022-12-30 22:18:00  @AyodeleYo11 @amerix Nah I’m good my masculini...   
3 2022-12-30 20:42:54  @Amerix_DontSimp There is no need for dating i...   
4 2022-12-30 20:32:23  @DyeAnna7 @amerix @kibe From what cos nothing ...   

  Sentiment  
0  Negative  
1   Neutral  
2  Positive  
3  Negative  
4  Positive  


In [10]:
STOPWORDS = set(stopwords.words('english'))

def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df['text'] = df['text'].apply(lambda text: cleaning_stopwords(text))

In [11]:
df.head()

Unnamed: 0,tweet_created_at,text,Sentiment
0,2022-12-30 22:37:46,Men refused listen to/attend @amerix classes m...,Negative
1,2022-12-30 22:30:58,@OvOBrezzzy Take amerix next,Neutral
2,2022-12-30 22:18:00,"@AyodeleYo11 @amerix Nah I’m good masculinity,...",Positive
3,2022-12-30 20:42:54,@Amerix_DontSimp There need dating first place,Negative
4,2022-12-30 20:32:23,@DyeAnna7 @amerix @kibe From cos nothing chang...,Positive


In [12]:
def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',data)
df['text']= df['text'].apply(lambda x: cleaning_URLs(x))

In [13]:
df.head()

Unnamed: 0,tweet_created_at,text,Sentiment
0,2022-12-30 22:37:46,Men refused listen to/attend @amerix classes m...,Negative
1,2022-12-30 22:30:58,@OvOBrezzzy Take amerix next,Neutral
2,2022-12-30 22:18:00,"@AyodeleYo11 @amerix Nah I’m good masculinity,...",Positive
3,2022-12-30 20:42:54,@Amerix_DontSimp There need dating first place,Negative
4,2022-12-30 20:32:23,@DyeAnna7 @amerix @kibe From cos nothing chang...,Positive


In [14]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
df['text']= df['text'].apply(lambda x: cleaning_punctuations(x))

In [15]:
df.head()

Unnamed: 0,tweet_created_at,text,Sentiment
0,2022-12-30 22:37:46,Men refused listen toattend amerix classes men...,Negative
1,2022-12-30 22:30:58,OvOBrezzzy Take amerix next,Neutral
2,2022-12-30 22:18:00,AyodeleYo11 amerix Nah I’m good masculinity he...,Positive
3,2022-12-30 20:42:54,AmerixDontSimp There need dating first place,Negative
4,2022-12-30 20:32:23,DyeAnna7 amerix kibe From cos nothing changing...,Positive
