# Exploratory data analysis

### 1. Import libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### 2. Load dataset

In [3]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train_E6oV3lV.csv")
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


### 3. Check dataset

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(31962, 3)

In [5]:
# Per-column dtypes: `id` and `label` are integers, `tweet` is held as object (text).
df.dtypes

id        int64
label     int64
tweet    object
dtype: object

In [6]:
# Summary statistics for the numeric columns. `label` is an integer between 0 and 1,
# so its mean is the share of rows labelled 1 (a quick check for class imbalance).
df.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


### 4. Check duplicate rows

In [7]:
# `id` looks like a running row number (1..31962 in describe() above), so a
# comparison that includes it can never flag two rows as duplicates. Compare
# on the content columns only. Naming them explicitly also keeps this cell
# re-runnable after later cells add list-valued columns, which `duplicated`
# cannot hash.
# keep=False marks every copy of a repeated (label, tweet) pair, not only the
# second and later occurrences.
tmp = df.duplicated(subset=["label", "tweet"], keep=False)
duplicated_rows = df[tmp]
duplicated_rows

Unnamed: 0,id,label,tweet


### 5. Check missing or null values

In [8]:
# Number of missing values in each column.
df.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

## Data Cleaning

### 1. Import libraries

In [9]:
import re
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# from nltk.corpus import wordnet as wn

### 2. Text Cleaning

In [10]:
# Disabled scratch lines: the first would snapshot `df` before the cleaning
# steps, the second would point `df` back at that snapshot.
# df_ori = df.copy()
# df = df_ori
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [11]:
def clean_text(tweet):
    """Strip URLs and @-mentions from a tweet.

    Two substitutions run in order: first every run of non-whitespace
    characters starting at ``http``, then every ``@`` followed by word
    characters. Matches are deleted, not replaced by a space, so the
    surrounding whitespace is left as it was.

    Args:
        tweet: The tweet text.

    Returns:
        The text with URLs and mentions removed.
    """
    for pattern in (r'http\S+', r'@\w+'):
        tweet = re.sub(pattern, '', tweet)
    return tweet

In [12]:
# Note: this overwrites the `tweet` column, so the text as loaded from the CSV
# is no longer available in `df` after this cell runs.
df['tweet'] = df['tweet'].apply(clean_text)
df.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [13]:
## https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/

# Expand contractions in Text Processing
def expand_contractions(tweet):
    """Expand contractions in a tweet, one whitespace-separated word at a time.

    Each word is passed to ``contractions.fix`` on its own and the results
    are re-joined with single spaces. Any run of whitespace in the input
    (including leading and trailing whitespace) therefore collapses.

    Args:
        tweet: The tweet text.

    Returns:
        The expanded text, words separated by single spaces.
    """
    return ' '.join(contractions.fix(word) for word in tweet.split())

In [14]:
# The expanded text goes into a new `new_text` column; `tweet` is not modified by this step.
df['new_text'] = df['tweet'].apply(expand_contractions)
df.head()

Unnamed: 0,id,label,tweet,new_text
0,1,0,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so selfi...
1,2,0,thanks for #lyft credit i can't use cause th...,thanks for #lyft credit i cannot use because t...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love you take with you all the time i...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [15]:
def remove_non_alpha(new_text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Despite the name, digits are kept. Punctuation, symbols such as ``#``
    and non-ASCII letters are removed without inserting a replacement.

    Args:
        new_text: The text to filter.

    Returns:
        The filtered text.
    """
    return re.compile(r'[^a-zA-Z0-9\s]').sub('', new_text)

In [16]:
# Replaces `new_text` with the output of remove_non_alpha (the column is overwritten, not copied).
df['new_text'] = df['new_text'].apply(remove_non_alpha)
df.head()

Unnamed: 0,id,label,tweet,new_text
0,1,0,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so selfi...
1,2,0,thanks for #lyft credit i can't use cause th...,thanks for lyft credit i cannot use because th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love you take with you all the time in...
4,5,0,factsguide: society now #motivation,factsguide society now motivation


### 2. Tokenization

In [17]:
def tokenizer(new_text):
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Args:
        new_text: The text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for the text (a list of token
        strings).

    Note:
        NOTE(review): ``word_tokenize`` presumably needs the NLTK tokenizer
        data to be downloaded beforehand; no download step is visible in
        this notebook, so confirm the environment provides it.
    """
    return word_tokenize(new_text)

In [18]:
# Each cell of the new `tokens` column holds the token list built from that row's `new_text`.
df['tokens'] = df['new_text'].apply(tokenizer)
df.head()

Unnamed: 0,id,label,tweet,new_text,tokens
0,1,0,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so selfi...,"[when, a, father, is, dysfunctional, and, is, ..."
1,2,0,thanks for #lyft credit i can't use cause th...,thanks for lyft credit i cannot use because th...,"[thanks, for, lyft, credit, i, can, not, use, ..."
2,3,0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in ...,model i love you take with you all the time in...,"[model, i, love, you, take, with, you, all, th..."
4,5,0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]"


In [19]:
def to_lowercase(lst_tokens):
    """Return a new list holding the lower-cased form of every token.

    Args:
        lst_tokens: Iterable of token strings.

    Returns:
        A list of the same length and order with ``.lower()`` applied to
        each token. The input is not modified.
    """
    lowered = []
    for word in lst_tokens:
        lowered.append(word.lower())
    return lowered

In [20]:
# Lower-case before the stop-word step below: that filter is an exact,
# case-sensitive string comparison against the stop-word list.
df['tokens'] = df['tokens'].apply(to_lowercase)
df.head()

Unnamed: 0,id,label,tweet,new_text,tokens
0,1,0,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so selfi...,"[when, a, father, is, dysfunctional, and, is, ..."
1,2,0,thanks for #lyft credit i can't use cause th...,thanks for lyft credit i cannot use because th...,"[thanks, for, lyft, credit, i, can, not, use, ..."
2,3,0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in ...,model i love you take with you all the time in...,"[model, i, love, you, take, with, you, all, th..."
4,5,0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]"


### 3. Removing Stopwords

In [21]:
# Filled on first use by _english_stop_words(); None means "not loaded yet".
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(lst_tokens, stop_words=None):
    """Drop stop words from a list of tokens.

    The default stop-word set is read from the NLTK corpus once and then
    reused. Previously it was rebuilt on every call, which means once per
    row when this function is used with ``Series.apply``.

    Args:
        lst_tokens: Iterable of token strings. Matching is exact and
            case-sensitive, so lower-case the tokens first.
        stop_words: Optional container of words to remove (a set or
            frozenset gives the fastest lookups). When omitted, NLTK's
            English stop-word list is used. Pass a custom set to keep
            words the default list would drop, e.g. negations.

    Returns:
        A new list with the tokens that are not in ``stop_words``, in their
        original order.
    """
    # `is None` rather than a truthiness test, so an explicitly empty
    # collection means "remove nothing" instead of falling back to NLTK.
    if stop_words is None:
        stop_words = _english_stop_words()
    return [token for token in lst_tokens if token not in stop_words]

In [22]:
# Overwrites `tokens`; the unfiltered token lists are not kept.
# NOTE(review): negations are removed here too. Row 1 loses "can" and "not"
# (compare this cell's output with the previous one); confirm that is
# acceptable for the label being modelled.
df['tokens'] = df['tokens'].apply(remove_stopwords)
df.head()

Unnamed: 0,id,label,tweet,new_text,tokens
0,1,0,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so selfi...,"[father, dysfunctional, selfish, drags, kids, ..."
1,2,0,thanks for #lyft credit i can't use cause th...,thanks for lyft credit i cannot use because th...,"[thanks, lyft, credit, use, offer, wheelchair,..."
2,3,0,bihday your majesty,bihday your majesty,"[bihday, majesty]"
3,4,0,#model i love u take with u all the time in ...,model i love you take with you all the time in...,"[model, love, take, time]"
4,5,0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, motivation]"


### 4. Lemmatization

In [23]:
def lemmatizer_tokens(lst_tokens):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech applies to every token.

    Args:
        lst_tokens: Iterable of token strings.

    Returns:
        A new list with the lemma of each token, in the original order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for word in lst_tokens:
        lemmas.append(lemmatize(word))
    return lemmas

In [24]:
# Overwrites `tokens` with the lemmatized lists.
df['tokens'] = df['tokens'].apply(lemmatizer_tokens)
df.head()

Unnamed: 0,id,label,tweet,new_text,tokens
0,1,0,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so selfi...,"[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0,thanks for #lyft credit i can't use cause th...,thanks for lyft credit i cannot use because th...,"[thanks, lyft, credit, use, offer, wheelchair,..."
2,3,0,bihday your majesty,bihday your majesty,"[bihday, majesty]"
3,4,0,#model i love u take with u all the time in ...,model i love you take with you all the time in...,"[model, love, take, time]"
4,5,0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, motivation]"


In [27]:
# Show the final token lists next to `id` and `label` for the first 20 rows.
df[['id', 'label', 'tokens']].head(20)

Unnamed: 0,id,label,tokens
0,1,0,"[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0,"[thanks, lyft, credit, use, offer, wheelchair,..."
2,3,0,"[bihday, majesty]"
3,4,0,"[model, love, take, time]"
4,5,0,"[factsguide, society, motivation]"
5,6,0,"[22, huge, fan, fare, big, talking, leave, cha..."
6,7,0,"[camping, tomorrow, danny]"
7,8,0,"[next, school, year, year, exam, think, school..."
8,9,0,"[love, land, allin, cavs, champion, cleveland,..."
9,10,0,"[welcome, gr8]"
