## This Notebook Contains Feature Engineering for Tweet Data

In [1]:
import pandas as pd
import numpy as np
import statistics
import Levenshtein as lev
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('All_Tweets.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Dropping Features with Little to No Data

In [3]:
df = df.drop(['favorited', 'retweeted', 'contributors', 'place', 'geo', 'truncated', 'Unnamed: 0'], axis=1)

### Engineering Features

In [4]:
#in_reply_to_user_id grouping by if the count is greater than 5000 or not
gp = df.in_reply_to_user_id.value_counts().to_frame().reset_index()
gp.columns = ['in_reply_to_user_id', 'counts']
df = df.merge(gp, on='in_reply_to_user_id', how='left')

#Set threshold and adjust orignal column, drop merged column
df['in_reply_to_user_id'] = np.where(df['counts']>5000, 0, 1)
df = df.drop(['counts'], axis=1)

#### Creating Feature for contains RT or @

In [5]:
# String to be searched in start of string  
search ="RT"
search2 ="@"
  
# boolean series returned 
df['retweet_y_n'] = df["text"].str.startswith(search) 
df['contains_@'] = df["text"].str.contains(search2)

#Replace Bool with int
df['retweet_y_n'] = df['retweet_y_n'].replace(True, 1) 
df['retweet_y_n'] = df['retweet_y_n'].replace(False, 0) 
df['contains_@'] = df['contains_@'].replace(True, 1) 
df['contains_@'] = df['contains_@'].replace(False, 0) 

#### Features for max/min/mean/std of retweet and favorite count, more features on source mode, in_reply_to_user_id mean & a few more

In [6]:
cols = ['user_id', 'retweet_count', 'favorite_count']

df1 = df[cols].groupby('user_id').mean() #retweet & favorite count mean for each user
df2 = df[cols].groupby('user_id').max() #retweet & favorite count max for each user
df3 = df[cols].groupby('user_id').min() #retweet & favorite count min for each user
df4 = df[cols].groupby('user_id').std() #retweet & favorite count std for each user
df5 = df.groupby(['user_id'])['source'].agg(pd.Series.mode).to_frame() #source mode for each user
df6 = df.groupby('user_id')['in_reply_to_user_id'].mean() #in_reply_to_any_user_id percentage of tweets
df7 = df.groupby('user_id')['retweet_y_n'].mean() #retweet percentage
df8 = df.groupby('user_id')['contains_@'].mean() #@ someone percentage

df_features = pd.merge(df1,df2,on='user_id')
df_features = pd.merge(df_features, df3, on= 'user_id')
df_features = pd.merge(df_features, df4, on= 'user_id')
df_features = pd.merge(df_features, df5, on='user_id')
df_features = pd.merge(df_features, df6, on='user_id')
df_features = pd.merge(df_features, df7, on='user_id')
df_features = pd.merge(df_features, df8, on='user_id')

#new df column names
cols = ['retweet_mean', 'favorite_count_mean', 'retweet_max', 'favorite_max', 'retweet_min', 'favorite_min', 'retweet_std', 'favorite_std', 'source_mode', 'in_reply_user_id_mean', 'retweet_%', 'contains_@_%']

df_features.columns = cols
df_features = df_features.reset_index()
df_features.head(2)

Unnamed: 0,user_id,retweet_mean,favorite_count_mean,retweet_max,favorite_max,retweet_min,favorite_min,retweet_std,favorite_std,source_mode,in_reply_user_id_mean,retweet_%,contains_@_%
0,586.0,13897.8,11.566667,183502.0,178.0,0.0,0.0,34847.116648,42.387119,Twitter Web App,1.0,0.866667,1.0
1,8557.0,985.066667,1.5,26777.0,39.0,0.0,0.0,4890.865419,7.094194,Twitter for iPhone,1.0,0.466667,0.7


#### Dropping a few columns from original df to make it easier to see what I have left to work with

In [7]:
cols = ['id', 'retweet_count', 'favorite_count', 'source', 'in_reply_to_user_id', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'retweet_y_n', 'contains_@']
df = df.drop(cols, axis=1)

In [8]:
df.head(2)

Unnamed: 0,text,user_id,created_at
0,RT @morningJewshow: Speaking about Jews and co...,678033.0,Fri May 01 00:18:11 +0000 2015
1,This age/face recognition thing..no reason pla...,678033.0,Thu Apr 30 21:50:52 +0000 2015


In [9]:
df_features.source_mode.value_counts()

<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                      3620
Twitter for iPhone                                                                       978
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>       493
Twitter Web App                                                                          453
<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>     203
                                                                                        ... 
<a href="http://www.lucianaDania.com" rel="nofollow">lucianaDania</a>                      1
<a href="http://www.sharedby.co" rel="nofollow">SharedBy</a>                               1
<a href="http://www.zoarib.it" rel="nofollow">ziarib</a>                                   1
<a href="http://www.politicoblu.com" rel="nofollow">politicoblu</a>                        1
[Hootsuite Inc., Twitter Web App]                                     

Trying to deal with Source Mode...

In [10]:
df_features['source_mode'] = df_features.source_mode.astype(str)

In [11]:
#in_reply_to_user_id grouping by if the count is greater than 5000 or not
gp = df_features.source_mode.value_counts().to_frame().reset_index()
gp.columns = ['source_mode', 'counts']
df_features = df_features.merge(gp, on='source_mode', how='left')

#Set threshold and adjust orignal column, drop merged column
df_features['source_mode'] = np.where(df_features['counts']<100, '1', df_features['source_mode'])
df_features = df_features.drop(['counts'], axis=1)

In [12]:
df_features['source_mode'] = df_features.source_mode.replace('<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'Twitter Web App')
df_features['source_mode'] = df_features.source_mode.replace('1', 'Other')
df_features['source_mode'] = df_features.source_mode.replace('<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Twitter for iPhone')
df_features['source_mode'] = df_features.source_mode.replace('<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'Twitter for Android')

In [13]:
df_features.source_mode.value_counts()

Twitter Web App        4073
Other                  1481
Twitter for iPhone     1471
Twitter for Android     345
TweetDeck               146
Name: source_mode, dtype: int64

In [33]:
df_features.head()

Unnamed: 0,user_id,retweet_mean,favorite_count_mean,retweet_max,favorite_max,retweet_min,favorite_min,retweet_std,favorite_std,source_mode,in_reply_user_id_mean,retweet_%,contains_@_%
0,586.0,13897.8,11.566667,183502.0,178.0,0.0,0.0,34847.116648,42.387119,Twitter Web App,1.0,0.866667,1.0
1,8557.0,985.066667,1.5,26777.0,39.0,0.0,0.0,4890.865419,7.094194,Twitter for iPhone,1.0,0.466667,0.7
2,12522.0,20.233333,0.6,581.0,2.0,0.0,0.0,105.977448,0.621455,TweetDeck,1.0,0.133333,0.966667
3,612473.0,60.533333,86.9,630.0,1070.0,1.0,0.0,118.0566,234.915737,Other,1.0,0.4,0.5
4,652193.0,43.133333,11.533333,1035.0,70.0,0.0,0.0,188.214429,17.093522,Other,1.0,0.233333,0.533333


### Other Ideas

In [9]:
#Lev Distance Between Users Tweets
#Sentiment Analysis of Users Tweet?
#What Can we do with Time?

### Merging User And New Engineered Feature Data

In [14]:
df_train = pd.read_csv('Train.csv')

In [15]:
df_train.target.value_counts()

0    5295
1    4448
Name: target, dtype: int64

In [16]:
df_final = df_features.merge(df_train, left_on='user_id', right_on='id')

In [17]:
df_final.to_csv('FINAL_DF.csv')