## Final project

Data Cleaning:
- Fix mis-encoded characters (e.g. ÛÏ)
- Translate slang into standard words
- Potentially replace emojis, links, and hashtags with generic placeholder tokens such as -emoji- or -link-

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import glob
import os
from pathlib import Path
import re
import plotly.express as px
from collections import Counter
pd.set_option('display.max_colwidth', 1000)
import nltk
import gensim

### Loading Data

In [3]:
# Locate the data directory relative to the directory the notebook is run from.
repo = Path.cwd()
data_path = repo.joinpath('..', 'data')

In [4]:
# Read the raw training split; every later cell adds columns to this frame.
raw_train_csv = data_path.joinpath('01_raw', 'train.csv')
df = pd.read_csv(raw_train_csv)

### Feature Engineering

In [5]:
# mention features
# Scan every tweet once: the list of @handles is stored, and the count and
# presence flag are derived from that stored list instead of re-running the
# same regex over the text a second time.
mention_re = re.compile(r'@([A-Za-z0-9_]+)')
df['mentions'] = df['text'].apply(mention_re.findall)   # handles without the leading '@'
df['mentions_cnt'] = df['mentions'].str.len()           # number of handles per tweet
df['mentions_bool'] = np.where(df['mentions_cnt'] >= 1, 1, 0)  # 1 if any handle, else 0

In [6]:
# hashtag features
# Scan every tweet once: the list of hashtags is stored, and the count and
# presence flag are derived from that stored list instead of re-running the
# same regex over the text a second time.
hashtag_re = re.compile(r'#([A-Za-z0-9_]+)')
df['hashtags'] = df['text'].apply(hashtag_re.findall)   # tags without the leading '#'
df['hashtags_cnt'] = df['hashtags'].str.len()           # number of tags per tweet
df['hashtags_bool'] = np.where(df['hashtags_cnt'] >= 1, 1, 0)  # 1 if any tag, else 0

In [7]:
# link features
# The previous pattern (http:\/\/t.co\/ followed by exactly 8 characters) only
# recognised plain-http links, left the '.' in "t.co" unescaped, and cut every
# match off after 8 slug characters. Use the same URL pattern that the cleaning
# step substitutes with `__link__`, so the link features and the cleaned text
# agree on what counts as a link and the full URL is stored.
link_re = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
df['links'] = df['text'].apply(link_re.findall)         # full URLs found in the tweet
df['links_cnt'] = df['links'].str.len()                 # number of URLs per tweet
df['links_bool'] = np.where(df['links_cnt'] >= 1, 1, 0)  # 1 if any URL, else 0

In [8]:
# Fraction (0-1) of each tweet's characters that are ASCII upper-case letters
# and ASCII lower-case letters, relative to the tweet's total character count.
char_total = df['text'].str.len()
df['uprchar_pct'] = df['text'].str.count(r'[A-Z]').div(char_total)
df['lwrchar_pct'] = df['text'].str.count(r'[a-z]').div(char_total)

In [9]:
def low_pct(x):
    """Return the fraction of whitespace-separated words in ``x`` that are lower case.

    A word counts as lower case when ``str.islower()`` is true for it, i.e. it
    has at least one cased character and none of them are upper case.

    Args:
        x: The tweet text.

    Returns:
        A float between 0.0 and 1.0. Text with no words (empty or
        whitespace-only) yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return sum(word.islower() for word in words) / len(words)

def up_pct(x):
    """Return the fraction of whitespace-separated words in ``x`` that are upper case.

    A word counts as upper case when ``str.isupper()`` is true for it, i.e. it
    has at least one cased character and none of them are lower case.

    Args:
        x: The tweet text.

    Returns:
        A float between 0.0 and 1.0. Text with no words (empty or
        whitespace-only) yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return sum(word.isupper() for word in words) / len(words)

def title_pct(x):
    """Return the fraction of whitespace-separated words in ``x`` that are title case.

    A word counts as title case when ``str.istitle()`` is true for it
    (e.g. ``'Hello'``; not ``'hello'`` or ``'HELLO'``).

    Args:
        x: The tweet text.

    Returns:
        A float between 0.0 and 1.0. Text with no words (empty or
        whitespace-only) yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = x.split()
    if not words:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return sum(word.istitle() for word in words) / len(words)

In [10]:
# Fraction of the words in each tweet that are lower case, upper case, and
# title case (first letter upper, rest lower), one column per casing style.
word_case_features = {
    'lwrcase_pct': low_pct,
    'upprcase_pct': up_pct,
    'titlecase_pct': title_pct,
}
for feature_name, case_fraction in word_case_features.items():
    df[feature_name] = df['text'].apply(case_fraction)

In [11]:
# Spot-check one randomly chosen row to eyeball the engineered feature columns.
df.sample(n=1)

Unnamed: 0,id,keyword,location,text,target,mentions,mentions_cnt,mentions_bool,hashtags,hashtags_cnt,hashtags_bool,links,links_cnt,links_bool,uprchar_pct,lwrchar_pct,lwrcase_pct,upprcase_pct,titlecase_pct
6688,9582,thunder,,Thunder lightening torrential rain and a power cut!,1,[],0,0,[],0,0,[],0,0,0.019608,0.823529,0.875,0.0,0.125


### Data Cleaning

In [12]:
# Stopword list = NLTK's English stopwords followed by a few extra tokens.
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# `nltk.corpus.stopwords` corpus reader; from here on `stopwords` is a plain list.
# NOTE(review): the punctuation entries only matter if the downstream tokenizer
# emits punctuation tokens - confirm against the tokenizer in use.
newStopWords = [',', '(', ')', '?', '[', ']', '$', '.', ':', "'s", 'of', 'the',
                '!', '"', '"', '/']
stopwords = [*nltk.corpus.stopwords.words('english'), *newStopWords]

In [19]:
# Patterns for the parts of a tweet that are collapsed into placeholder tokens.
link_placeholder_re = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
mention_placeholder_re = re.compile(r'@([A-Za-z0-9_]+)')
hashtag_placeholder_re = re.compile(r'#([A-Za-z0-9_]+)')


def clean_tweet(text, stop_words):
    """Return ``text`` as a lower-cased, stopword-free, space-joined token string.

    Links, @mentions and #hashtags are replaced by the placeholder tokens
    ``__link__``, ``__mention__`` and ``__hashtag__`` before tokenizing.

    Args:
        text: The raw tweet text.
        stop_words: Collection of lower-case tokens to drop (a set is fastest).

    Returns:
        The remaining tokens, lower-cased and joined by single spaces.
    """
    # Replace links first: substituting a mention/hashtag inside a URL inserts a
    # space, which would split the URL and leak its tail as ordinary word tokens.
    text = link_placeholder_re.sub('__link__ ', text)
    text = mention_placeholder_re.sub('__mention__ ', text)
    text = hashtag_placeholder_re.sub('__hashtag__ ', text)
    # Lower-case each token once, then filter against the stopword collection.
    lowered = (token.lower() for token in gensim.utils.tokenize(text))
    return ' '.join(token for token in lowered if token not in stop_words)


# Membership tests on a list rescan the whole list for every token; build a set once.
stopword_set = set(stopwords)
tweet_col = [clean_tweet(text, stopword_set) for text in df['text']]
df['tweet'] = tweet_col

In [20]:
# Show the first row to confirm the new `tweet` column alongside the raw text.
df.head(n=1)

Unnamed: 0,id,keyword,location,text,target,mentions,mentions_cnt,mentions_bool,hashtags,hashtags_cnt,hashtags_bool,links,links_cnt,links_bool,uprchar_pct,lwrchar_pct,lwrcase_pct,upprcase_pct,titlecase_pct,tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,[],0,0,[earthquake],1,1,[],0,0,0.144928,0.666667,0.538462,0.076923,0.384615,deeds reason __hashtag__ may allah forgive us


In [21]:
# Persist the cleaned frame (raw columns + engineered features + `tweet`).
clean_dir = data_path / '02_clean'
# to_csv does not create missing directories, so make sure the target exists.
clean_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): the list-valued columns (mentions, hashtags, links) are written
# as text in a CSV - confirm that whatever reads this file parses them back.
df.to_csv(clean_dir / 'train.csv', index=False)