In [1]:
import pandas as pd
import numpy as np
import nltk
# %run utils.ipynb

### Importing Twitter Dataset

In [2]:
df_twitter = pd.read_csv('formatted_data/twitter.csv')
df_twitter = df_twitter.rename(columns={'clean_text':'text', 'category':'label'})
df_twitter = df_twitter.drop('Unnamed: 0', axis=1)
df_twitter

Unnamed: 0,text,label
0,what did just say vote for modi welcome bjp t...,1
1,asking his supporters prefix chowkidar their n...,1
2,answer who among these the most powerful world...,1
3,with upcoming election india saga going import...,1
4,gandhi was gay does modi,1
...,...,...
107752,engine growth modi unveils indias first 12000 ...,1
107753,modi promised 2014 lok sabha elections that be...,1
107754,why these 456 crores paid neerav modi not reco...,-1
107755,dear rss terrorist payal gawar what about modi...,-1


In [3]:
df_imdb_rating = pd.read_csv('formatted_data/imdb_rating.csv')
df_imdb_rating = df_imdb_rating.drop('Unnamed: 0', axis=1)
df_imdb_rating

Unnamed: 0,text,label
0,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,-1
1,This movie was so poorly written and directed ...,-1
2,The most interesting thing about Miryang (Secr...,1
3,"when i first read about ""berlin am meer"" i did...",-1
4,"I saw this film on September 1st, 2005 in Indi...",1
...,...,...
49620,"Man, I loved this movie! This really takes me ...",-1
49621,Recovery is an incredibly moving piece of work...,-1
49622,"You can take the crook out of the joint, but i...",-1
49623,FUTZ is the only show preserved from the exper...,-1


In [4]:
df_reddit_comment = pd.read_csv('formatted_data/reddit_comment.csv')
df_reddit_comment = df_reddit_comment.drop('Unnamed: 0', axis=1)
df_reddit_comment

Unnamed: 0,text,label
0,Funeral ceremony...gloomy friday...,-1
1,wants to hang out with friends SOON!,1
2,Re-pinging @ghostridah14: why didn't you go to...,-1
3,"I should be sleep, but im not! thinking about ...",-1
4,Hmmm. http://www.djhero.com/ is down,-1
...,...,...
30529,Succesfully following Tayla!!,1
30530,Happy Mothers Day All my love,1
30531,Happy Mother's Day to all the mommies out ther...,1
30532,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,1


### Stacking Three different dataset

In [5]:
df_combined = np.vstack([df_twitter, df_imdb_rating, df_reddit_comment]) # stacking all three dataset 
df_combined.shape

(187916, 2)

In [6]:
df_combined = pd.DataFrame(df_combined, columns=['text', 'label'])
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187916 entries, 0 to 187915
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    187916 non-null  object
 1   label   187916 non-null  object
dtypes: object(2)
memory usage: 2.9+ MB


### Check duplicates

In [7]:
df_combined.duplicated().sum()

107

In [8]:
df_combined = df_combined.drop_duplicates()
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187809 entries, 0 to 187915
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    187809 non-null  object
 1   label   187809 non-null  object
dtypes: object(2)
memory usage: 4.3+ MB


In [9]:
import re

class Preprocessing:
    def __init__(self, text):
        self.text = text
        
    def remove_URL(self):
        return re.sub(r'https?://\S+|www\.\S+','', self.text)
    
    def remove_emoji(self):
        emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
        return emoji_pattern.sub(r'', self.text)
    
    def remove_html(self):
        return re.sub(r'^[^ ]<.*?>|&([a-z0-9]+|#[0-9]\"\'\“{1,6}|#x[0-9a-f]{1,6});[^A-Za-z0-9]+', '', self.text)
    
    def remove_punct(self):
        return re.sub('[(\[.\?#@+,<>%~`!$^&\(\):;\])\1+]', '', self.text)
    
    def remove_quotes(self):
        return re.sub(r'[^A-Za-z0-9\s]+', '', self.text)

In [12]:
df = df_combined.copy()
df['plain_text'] = df['text'].apply(lambda x: Preprocessing(str(x)).remove_URL())
df['plain_text'] = df['plain_text'].apply(lambda x: Preprocessing(str(x)).remove_emoji())
df['plain_text'] = df['plain_text'].apply(lambda x: Preprocessing(str(x)).remove_html())
df['plain_text'] = df['plain_text'].apply(lambda x: Preprocessing(str(x)).remove_punct())
df['plain_text'] = df['plain_text'].apply(lambda x: Preprocessing(str(x)).remove_quotes())

df.head(6)

Unnamed: 0,text,label,plain_text
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...
4,gandhi was gay does modi,1,gandhi was gay does modi
5,things like demonetisation gst goods and servi...,1,things like demonetisation gst goods and servi...


### Word Tokenized

In [13]:
# Tokenizing the tweet base texts.
from nltk.tokenize import word_tokenize
df['tokenized'] = df['plain_text'].apply(word_tokenize)
df.head(10)


Unnamed: 0,text,label,plain_text,tokenized
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom..."
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t..."
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu..."
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,..."
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]"
5,things like demonetisation gst goods and servi...,1,things like demonetisation gst goods and servi...,"[things, like, demonetisation, gst, goods, and..."
6,hope tuthukudi people would prefer honest well...,1,hope tuthukudi people would prefer honest well...,"[hope, tuthukudi, people, would, prefer, hones..."
7,calm waters wheres the modi wave,1,calm waters wheres the modi wave,"[calm, waters, wheres, the, modi, wave]"
8,vote such party and leadershipwho can take fas...,-1,vote such party and leadershipwho can take fas...,"[vote, such, party, and, leadershipwho, can, t..."
9,dont play with the words was talking about the...,1,dont play with the words was talking about the...,"[dont, play, with, the, words, was, talking, a..."


In [14]:
# Lower casing clean text.

df['lower'] = df['tokenized'].apply(
    lambda x: [word.lower() for word in x])

df.head()

Unnamed: 0,text,label,plain_text,tokenized,lower
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom...","[what, did, just, say, vote, for, modi, welcom..."
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t...","[asking, his, supporters, prefix, chowkidar, t..."
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu...","[answer, who, among, these, the, most, powerfu..."
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,...","[with, upcoming, election, india, saga, going,..."
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]","[gandhi, was, gay, does, modi]"


### Stop Word

In [20]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [21]:
# Removing stopwords.

df['stopwords_removed'] = df['lower'].apply(
    lambda x: [word for word in x if word not in stop_words])

df.head()

Unnamed: 0,text,label,plain_text,tokenized,lower,stopwords_removed
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom...","[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m..."
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t...","[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,..."
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu...","[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today..."
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,...","[with, upcoming, election, india, saga, going,...","[upcoming, election, india, saga, going, impor..."
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]","[gandhi, was, gay, does, modi]","[gandhi, gay, modi]"


### POS TAG

In [22]:
# finding the pos
def find_pos(lst:list)->list:
    pos_ = nltk.pos_tag(lst)
    return pos_

df['pos_tag'] = df['stopwords_removed'].apply(find_pos)
df.head()

Unnamed: 0,text,label,plain_text,tokenized,lower,stopwords_removed,pos_tag
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom...","[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...","[(say, VB), (vote, NN), (modi, FW), (welcome, ..."
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t...","[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...","[(asking, VBG), (supporters, NNS), (prefix, VB..."
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu...","[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...","[(answer, NN), (among, IN), (powerful, JJ), (w..."
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,...","[with, upcoming, election, india, saga, going,...","[upcoming, election, india, saga, going, impor...","[(upcoming, JJ), (election, NN), (india, NN), ..."
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]","[gandhi, was, gay, does, modi]","[gandhi, gay, modi]","[(gandhi, NN), (gay, NN), (modi, NN)]"


In [23]:
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [24]:
def replace_pos_tag(lst:list)->list:
    temp =[]
    for (word, pos_tag) in lst:
        temp.append((word, get_wordnet_pos(pos_tag)))
    return temp

In [25]:
df['pos'] = df['pos_tag'].apply(replace_pos_tag)
df.head()

Unnamed: 0,text,label,plain_text,tokenized,lower,stopwords_removed,pos_tag,pos
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom...","[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...","[(say, VB), (vote, NN), (modi, FW), (welcome, ...","[(say, v), (vote, n), (modi, n), (welcome, a),..."
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t...","[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...","[(asking, VBG), (supporters, NNS), (prefix, VB...","[(asking, v), (supporters, n), (prefix, v), (c..."
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu...","[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...","[(answer, NN), (among, IN), (powerful, JJ), (w...","[(answer, n), (among, n), (powerful, a), (worl..."
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,...","[with, upcoming, election, india, saga, going,...","[upcoming, election, india, saga, going, impor...","[(upcoming, JJ), (election, NN), (india, NN), ...","[(upcoming, a), (election, n), (india, n), (sa..."
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]","[gandhi, was, gay, does, modi]","[gandhi, gay, modi]","[(gandhi, NN), (gay, NN), (modi, NN)]","[(gandhi, n), (gay, n), (modi, n)]"


### Lemmitizing

In [26]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

def lemmitization(lst:list)->list:
    temp = []
    for item in lst: 
        temp.append(lem.lemmatize(item[0], item[1]))
    return temp

In [27]:
df['lemmitized'] = df['pos'].apply(lemmitization)
df.head()

Unnamed: 0,text,label,plain_text,tokenized,lower,stopwords_removed,pos_tag,pos,lemmitized
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom...","[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...","[(say, VB), (vote, NN), (modi, FW), (welcome, ...","[(say, v), (vote, n), (modi, n), (welcome, a),...","[say, vote, modi, welcome, bjp, tell, rahul, m..."
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t...","[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...","[(asking, VBG), (supporters, NNS), (prefix, VB...","[(asking, v), (supporters, n), (prefix, v), (c...","[ask, supporter, prefix, chowkidar, name, modi..."
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu...","[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...","[(answer, NN), (among, IN), (powerful, JJ), (w...","[(answer, n), (among, n), (powerful, a), (worl...","[answer, among, powerful, world, leader, today..."
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,...","[with, upcoming, election, india, saga, going,...","[upcoming, election, india, saga, going, impor...","[(upcoming, JJ), (election, NN), (india, NN), ...","[(upcoming, a), (election, n), (india, n), (sa...","[upcoming, election, india, saga, go, importan..."
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]","[gandhi, was, gay, does, modi]","[gandhi, gay, modi]","[(gandhi, NN), (gay, NN), (modi, NN)]","[(gandhi, n), (gay, n), (modi, n)]","[gandhi, gay, modi]"


### Combining the Words a Document

In [30]:
def combine_words(lst:list)->str:
    temp = ''
    for item in lst:
        temp += item + ' '
    return temp

In [31]:
df['combined_words'] = df['lemmitized'].apply(combine_words)
df.head()

Unnamed: 0,text,label,plain_text,tokenized,lower,stopwords_removed,pos_tag,pos,lemmitized,combined_words
0,what did just say vote for modi welcome bjp t...,1,what did just say vote for modi welcome bjp t...,"[what, did, just, say, vote, for, modi, welcom...","[what, did, just, say, vote, for, modi, welcom...","[say, vote, modi, welcome, bjp, told, rahul, m...","[(say, VB), (vote, NN), (modi, FW), (welcome, ...","[(say, v), (vote, n), (modi, n), (welcome, a),...","[say, vote, modi, welcome, bjp, tell, rahul, m...",say vote modi welcome bjp tell rahul main camp...
1,asking his supporters prefix chowkidar their n...,1,asking his supporters prefix chowkidar their n...,"[asking, his, supporters, prefix, chowkidar, t...","[asking, his, supporters, prefix, chowkidar, t...","[asking, supporters, prefix, chowkidar, names,...","[(asking, VBG), (supporters, NNS), (prefix, VB...","[(asking, v), (supporters, n), (prefix, v), (c...","[ask, supporter, prefix, chowkidar, name, modi...",ask supporter prefix chowkidar name modi great...
2,answer who among these the most powerful world...,1,answer who among these the most powerful world...,"[answer, who, among, these, the, most, powerfu...","[answer, who, among, these, the, most, powerfu...","[answer, among, powerful, world, leader, today...","[(answer, NN), (among, IN), (powerful, JJ), (w...","[(answer, n), (among, n), (powerful, a), (worl...","[answer, among, powerful, world, leader, today...",answer among powerful world leader today trump...
3,with upcoming election india saga going import...,1,with upcoming election india saga going import...,"[with, upcoming, election, india, saga, going,...","[with, upcoming, election, india, saga, going,...","[upcoming, election, india, saga, going, impor...","[(upcoming, JJ), (election, NN), (india, NN), ...","[(upcoming, a), (election, n), (india, n), (sa...","[upcoming, election, india, saga, go, importan...",upcoming election india saga go important pair...
4,gandhi was gay does modi,1,gandhi was gay does modi,"[gandhi, was, gay, does, modi]","[gandhi, was, gay, does, modi]","[gandhi, gay, modi]","[(gandhi, NN), (gay, NN), (modi, NN)]","[(gandhi, n), (gay, n), (modi, n)]","[gandhi, gay, modi]",gandhi gay modi
