## Final project

Data Cleaning:
- Clean mis-encoded characters (mojibake such as `ÛÏ`)
- Translate slang to words
- Potentially generalize emojis, links, hashtags to something like -emoji- or -link-

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import glob
import os
from pathlib import Path
import re
import plotly.express as px
from collections import Counter
pd.set_option('display.max_colwidth', 1000)
import nltk
import gensim

### Loading Data

In [2]:
# Locate the data directory relative to the notebook's working directory.
repo = Path.cwd()
data_path = repo.joinpath('..', 'data')

In [3]:
# Read the raw training set from <data_path>/01_raw/train.csv.
df = pd.read_csv(data_path.joinpath('01_raw', 'train.csv'))

### Feature Engineering

In [4]:
# mention features
# mentions      -> list of @handles in the tweet (without the leading '@')
# mentions_cnt  -> number of handles found
# mentions_bool -> 1 if the tweet has at least one mention, else 0
# The regex is run once; the count is derived from the extracted lists
# instead of scanning every tweet a second time.
df['mentions'] = df['text'].apply(lambda x: re.findall(r'@([A-Za-z0-9_]+)', x))
df['mentions_cnt'] = df['mentions'].str.len()
df['mentions_bool'] = np.where(df['mentions_cnt'] >= 1, 1, 0)

In [5]:
# hashtag features
# hashtags      -> list of hashtag words in the tweet (without the leading '#')
# hashtags_cnt  -> number of hashtags found
# hashtags_bool -> 1 if the tweet has at least one hashtag, else 0
# The regex is run once; the count is derived from the extracted lists
# instead of scanning every tweet a second time.
df['hashtags'] = df['text'].apply(lambda x: re.findall(r'#([A-Za-z0-9_]+)', x))
df['hashtags_cnt'] = df['hashtags'].str.len()
df['hashtags_bool'] = np.where(df['hashtags_cnt'] >= 1, 1, 0)

In [6]:
# link features
# links      -> list of URLs in the tweet (http://, https:// or www. prefixed)
# links_cnt  -> number of URLs found
# links_bool -> 1 if the tweet has at least one URL, else 0
# The regex is run once; the count is derived from the extracted lists
# instead of scanning every tweet a second time.
df['links'] = df['text'].apply(lambda x: re.findall(r'https?://[^\s<>"]+|www\.[^\s<>"]+', x))
df['links_cnt'] = df['links'].str.len()
df['links_bool'] = np.where(df['links_cnt'] >= 1, 1, 0)

In [7]:
# Fraction of each tweet's characters that are ASCII upper-case letters (A-Z)
# and ASCII lower-case letters (a-z), relative to the tweet's total length.
df['uprchar_pct'] = df['text'].str.count(r'[A-Z]').div(df['text'].str.len())
df['lwrchar_pct'] = df['text'].str.count(r'[a-z]').div(df['text'].str.len())

In [8]:
def low_pct(x):
    """Return the fraction of whitespace-separated words in ``x`` that are lower case.

    A word counts when ``str.islower()`` is true for it, i.e. it has at least
    one cased character and all of its cased characters are lower case.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when ``x`` contains no words (empty or
        whitespace-only), instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return sum(word.islower() for word in words) / len(words)

def up_pct(x):
    """Return the fraction of whitespace-separated words in ``x`` that are upper case.

    A word counts when ``str.isupper()`` is true for it, i.e. it has at least
    one cased character and all of its cased characters are upper case.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when ``x`` contains no words (empty or
        whitespace-only), instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return sum(word.isupper() for word in words) / len(words)

def title_pct(x):
    """Return the fraction of whitespace-separated words in ``x`` that are title case.

    A word counts when ``str.istitle()`` is true for it (upper-case characters
    only follow uncased characters, lower-case characters only follow cased
    ones, and there is at least one cased character).

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when ``x`` contains no words (empty or
        whitespace-only), instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return sum(word.istitle() for word in words) / len(words)

In [9]:
# Word-level casing features: share of whitespace-separated words in each tweet
# that are lower case, upper case, or title case (first letter upper, rest lower).
df['lwrcase_pct'] = df['text'].apply(low_pct)
df['upprcase_pct'] = df['text'].apply(up_pct)
df['titlecase_pct'] = df['text'].apply(title_pct)

### Data Cleaning

In [11]:
# Stop-word list used when cleaning tweets: NLTK's English list followed by a
# few extra punctuation tokens and short words.
# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus in the
# imports cell) to a plain list. The corpus itself is reached through
# `nltk.corpus.stopwords`, so the cell still works when it is run again.
newStopWords = [
    ',', '(', ')', '?', '[', ']', '$', '.', ':', "'s", 'of', 'the',
    '!', '"', '"', '/',
]
stopwords = nltk.corpus.stopwords.words('english') + newStopWords

In [12]:
def _clean_tweet(text, stop_words):
    """Return a normalised, stop-word-free version of one raw tweet.

    Links, @mentions and #hashtags are replaced by the placeholder tokens
    ``__link__``, ``__mention__`` and ``__hashtag__``. The remaining text is
    tokenised with ``gensim.utils.tokenize``, lower-cased and filtered against
    ``stop_words``. The hashtag words themselves are then appended to the end
    of the result, space-separated, in their original casing.

    Parameters
    ----------
    text : str
        Raw tweet text.
    stop_words : collection of str
        Lower-case tokens to drop; only membership tests are performed.

    Returns
    -------
    str
        Space-joined cleaned tokens, followed by the hashtag words if any.
    """
    # Replace links first: a URL may contain '@' or '#' (".../@user",
    # ".../page#top") or be glued to a preceding hashtag ("#newshttp://...").
    # Substituting mentions/hashtags before links would tear such URLs apart
    # and produce mentions/hashtags that are not really in the tweet.
    text = re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', ' __link__ ', text)
    text = re.sub(r'@([A-Za-z0-9_]+)', ' __mention__ ', text)
    hash_words = re.findall(r'#([A-Za-z0-9_]+)', text)
    text = re.sub(r'#([A-Za-z0-9_]+)', ' __hashtag__ ', text)

    tokens = (token.lower() for token in gensim.utils.tokenize(text))
    cleaned = ' '.join(token for token in tokens if token not in stop_words)

    if hash_words:
        # NOTE(review): hashtag words keep their original casing and are not
        # stop-word filtered, unlike the rest of the text - confirm this is intended.
        cleaned = cleaned + ' ' + ' '.join(hash_words)
    return cleaned


# Build the lookup set once: membership in a set is O(1), whereas the list
# would be scanned for every token of every tweet.
stopword_set = set(stopwords)
tweet_col = [_clean_tweet(tweet, stopword_set) for tweet in df['text']]
df['tweet'] = tweet_col

In [17]:
# Preview ten random rows; the fixed seed makes the preview show the same rows
# on every Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,id,keyword,location,text,target,mentions,mentions_cnt,mentions_bool,hashtags,hashtags_cnt,hashtags_bool,links,links_cnt,links_bool,uprchar_pct,lwrchar_pct,lwrcase_pct,upprcase_pct,titlecase_pct,tweet
2669,3831,detonate,,@WoundedPigeon http://t.co/s9soAeVcVo Detonate by @ApolloBrown ft. M.O.P.,0,"[WoundedPigeon, ApolloBrown]",2,1,[],0,0,[http://t.co/s9soAeVcVo],1,1,0.150685,0.60274,0.285714,0.142857,0.285714,__mention__ __link__ detonate __mention__ ft p
2749,3952,devastation,"Wasington, DC",70 Years After Atomic Bombs Japan Still Struggles With War Past: The anniversary of the devastation wrought b... http://t.co/pmS4pMuR0q,1,[],0,0,[],0,0,[http://t.co/pmS4pMuR0q],1,1,0.103704,0.666667,0.315789,0.0,0.578947,years atomic bombs japan still struggles war past anniversary devastation wrought b __link__
4165,5918,harm,å_: ?? ÌÑ ? : ?,someone just reblogged a picture of self harm scars oh please its 2015 can we stop,0,[],0,0,[],0,0,[],0,0,0.0,0.768293,0.9375,0.0,0.0,someone reblogged picture self harm scars oh please stop
3060,4391,earthquake,#keepthefaith J&J,Earthquake drill ??,1,[],0,0,[],0,0,[],0,0,0.052632,0.736842,0.333333,0.0,0.333333,earthquake drill
2463,3531,derailment,India,25 killed 50 injured in Madhya Pradesh twin train derailment http://t.co/DNU5HWSxo2,1,[],0,0,[],0,0,[http://t.co/DNU5HWSxo2],1,1,0.096386,0.650602,0.545455,0.0,0.181818,killed injured madhya pradesh twin train derailment __link__
3353,4799,evacuated,WA State,Entire town of Roosevelt Wash. evacuated because of wildfire http://t.co/CmwEIojJ55,1,[],0,0,[],0,0,[http://t.co/CmwEIojJ55],1,1,0.084337,0.710843,0.6,0.0,0.3,entire town roosevelt wash evacuated wildfire __link__
1856,2668,crush,"San Diego, Texas.",Love love love do you remember your first crush ? ??,0,[],0,0,[],0,0,[],0,0,0.019231,0.730769,0.727273,0.0,0.090909,love love love remember first crush
2828,4064,displaced,,#KCA #VoteJKT48ID 12News: UPDATE: A family of 3 has been displaced after fired damaged housed near 90th and Osborn. Fire extinguished no iÛ_,1,[],0,0,"[KCA, VoteJKT48ID]",2,1,[],0,0,0.134752,0.602837,0.608696,0.130435,0.173913,__hashtag__ __hashtag__ news update family displaced fired damaged housed near th osborn fire extinguished û_ KCA VoteJKT48ID
6890,9877,traumatised,Tunbridge Wells,@PerkPearl that's just not on. I'd be traumatised are you OK? The car has gone and now for #GBBO and relax.....,0,[PerkPearl],1,1,[GBBO],1,1,[],0,0,0.09009,0.630631,0.761905,0.095238,0.047619,__mention__ traumatised ok car gone __hashtag__ relax GBBO
1521,2195,catastrophic,"Dublin, Ireland",'Kessler Syndrome' is the name for the catastrophic exponential proliferation of Space debris and destruction of satellites. #GravityMovie,1,[],0,0,[GravityMovie],1,1,[],0,0,0.036232,0.811594,0.777778,0.0,0.166667,kessler syndrome name catastrophic exponential proliferation space debris destruction satellites __hashtag__ GravityMovie


In [14]:
# Write the frame (engineered features + cleaned `tweet` column) to
# <data_path>/02_clean/train.csv.
# to_csv does not create missing directories, so make sure the target exists.
# NOTE: the list-valued columns (mentions, hashtags, links) are written as their
# string representation and are read back by read_csv as plain strings.
clean_dir = data_path / '02_clean'
clean_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_dir / 'train.csv', index=False)