In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import re

In [4]:
df = pd.read_csv('data/tweet_emotions.csv')
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [5]:
df.info() # no null values detected

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


### Preprocessing the data

In [7]:
import re
li = []
for x in df['content']:
    x = re.sub(r'http\S+|www\S+|http\S+','', x, flags=re.MULTILINE)
    x = re.sub(r'\.',' ',x)
    x = re.sub(r'([?!.,])\1+', r'\1', x)  # removes duplicacy like ??? with ?
    x = re.sub(r'(\w)\1{1,}', r'\1', x) # replaces words like soooo with so
    x = re.sub(r'\b[a-zA-Z]+\*+\w*\b','Anger',x) # remove words which starts with a alphabet and contains any * after 
    x = re.sub(r'[^a-zA-Z0-9\s\!\?\,\@\']','',x)  # remove any other symbols other than space,.,!,?
    x = re.sub(r'\b(\S+?)([^\w\s]*\s+\1\b)+', r'\1', x) # removing repeated words. ex: F! F! F! to F!
    x = re.sub(r'@\S+','[name]',x) # replacing names with entity [NAME]
    li.append(x)

In [8]:
li

['[name] i know  i was listenin to bad habit earlier and i started freakin at his part ',
 'Layin n bed with a headache  ugh   waitin on your cal   ',
 'Funeral ceremony   glomy friday   ',
 'wants to hang out with friends SON!',
 '[name] We want to trade with someone who has Houston tickets, but no one wil ',
 "Repinging [name] why didn't you go to prom? BC my bf didn't like my friends",
 "I should be slep, but im not! thinking about an old friend who I want  but he's maried now  damn, amp he wants me 2! scandalous!",
 'Hm   is down',
 '[name] Charlene my love  I mis you',
 "[name] I'm sory  at least it's Friday?",
 'cant fal aslep',
 'Choked on her retainers',
 'Ugh! I have to beat this stupid song to get to the next  rude!',
 '[name] if u watch the hils in london u wil realise what tourture it is because were weks and weks late  i just watch itonlinelol',
 'Got the news',
 'The storm is here and the electricity is gone',
 '[name] agred',
 "So slepy again and it's not even that late 

### Tokenization

In [9]:
li1 = []
for x in li:
    li1.append(x.split())

In [10]:
li1[0]

['[name]',
 'i',
 'know',
 'i',
 'was',
 'listenin',
 'to',
 'bad',
 'habit',
 'earlier',
 'and',
 'i',
 'started',
 'freakin',
 'at',
 'his',
 'part']

In [11]:
import spacy 
  
nlp = spacy.load('en_core_web_sm') 
  
sentence = "I loved trip to paris with john, but now i miss him"
  
doc = nlp(sentence) 
for ent in doc.ents: 
    sentence = sentence.replace(ent.text,'['+ent.label_+']')
sentence

'I loved trip to [GPE] with [PERSON], but now i miss him'

### NER (Name Entity Recognition)

In [46]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

nlp = spacy.load("en_core_web_sm")

li2 =[]
def ner(li):
    for i in li:
        text = " ".join(i)
        doc = nlp(text)
        for i in doc.ents:
            text = text.replace(ent.text,'['+ent.label_+']')
        li2.append(text.split())
ner(li1) 

In [None]:
# saving the list as json format
import json
with open("li2.json",'w') as f:
    json.dump(li2,f)

In [13]:
# ner reduces noise by replacing common words like names,locations and others with their classification