## Data Cleaning and Feature Engineering for User Data

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import gc; gc.enable()

## Import Data & Merge Tables

In [2]:
# Load the genuine-account data (tweets and user profiles).
# NOTE(review): df_tweets_real is not referenced by the cells shown here - confirm it is still needed.
df_tweets_real = pd.read_csv('tweets_real.csv')
df_users_real = pd.read_csv('users_real.csv')

# Drop columns that don't have much data (one call instead of one per column).
df_users_real = df_users_real.drop(
    columns=['test_set_2', 'timestamp', 'crawled_at', 'test_set_1']
)

# Label every genuine account.
df_users_real['target'] = 0  # isreal

# Load both social-spambot user tables, dropping the same bookkeeping columns.
df_users_spambot1 = pd.read_csv('users_social_spambot1.csv').drop(
    columns=['test_set_1', 'timestamp', 'crawled_at']
)
df_users_spambot2 = pd.read_csv('users_social_spambot2.csv').drop(
    columns=['timestamp', 'crawled_at']
)
df_users_spambot = pd.concat([df_users_spambot1, df_users_spambot2])

# Label every spambot account.
df_users_spambot['target'] = 1  # isSpamBot

# Fake-follower accounts (NOT USING FOR NOW)
# df_users_fake = pd.read_csv('users_fake_followers.csv')
# df_tweets_fake = pd.read_csv('tweets_fake_followers.csv')

# Label for fake-follower accounts
# df_users_fake['target'] = 2 #isFakeFollower

# Stack the labelled user tables into a single frame.
# df_users = pd.concat([df_users_real, df_users_spambot, df_users_fake])
df_users = pd.concat([df_users_real, df_users_spambot])
# df_tweets = pd.concat([df_tweets_real, df_tweets_spambot])

## Dropping Columns with Bad Data/Useless Data

In [3]:
# Columns that have very few populated data points.
sparse_cols = [
    'is_translator', 'profile_background_tile', 'default_profile_image',
    'follow_request_sent', 'protected', 'verified', 'notifications',
    'contributors_enabled', 'following', 'updated', 'profile_use_background_image',
]
# Columns whose values are entirely or mostly unique per account.
near_unique_cols = ['screen_name', 'name', 'profile_image_url', 'profile_image_url_https']

df_users = df_users.drop(columns=sparse_cols + near_unique_cols)

# Presence flags: 1 if a link / banner was provided, 0 if the field is missing.
for link_col in ('url', 'profile_banner_url'):
    df_users[link_col] = df_users[link_col].notna().astype('int64')

# Missing values in the flag columns are replaced with 0.
for flag_col in ('default_profile', 'geo_enabled'):
    df_users[flag_col] = df_users[flag_col].fillna(0)

## More Real User Data Gathered from the Twitter API

In [4]:
# Additional genuine accounts, collected from the Twitter API.
df_users_real2 = pd.read_csv('more_real_users.csv')
# NOTE(review): 'profile_background_title' is spelled differently from the
# 'profile_background_tile' column dropped from df_users - confirm it matches this CSV's header.
df_users_real2 = df_users_real2.drop(columns=['Unnamed: 0', 'profile_background_title'])
df_users_real2['target'] = 0  # isreal

# Presence flags: 1 if a link / banner was provided, 0 if the field is missing.
for link_col in ('url', 'profile_banner_url'):
    df_users_real2[link_col] = df_users_real2[link_col].notna().astype('int64')

# Missing flags become 0, then booleans are mapped to integers (True -> 1, False -> 0).
for flag_col in ('default_profile', 'geo_enabled'):
    filled = df_users_real2[flag_col].fillna(0)
    df_users_real2[flag_col] = filled.replace(True, 1).replace(False, 0)

## Preparing Final Dataset & Sending to CSV

In [5]:
# Stack the original user table with the extra genuine accounts.
df_final = pd.concat([df_users, df_users_real2])

# Fill missing background-image URLs with the most frequent value of each column.
for bg_col in ('profile_background_image_url_https', 'profile_background_image_url'):
    most_common = df_final[bg_col].mode()[0]
    df_final[bg_col] = df_final[bg_col].fillna(most_common)

In [31]:
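# #DISABLED, kept for reference only: this code buckets rare values of several features under a threshold.
# #For each feature: create a per-value count column, merge it onto df, then relabel values below the threshold as '1'.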
# gp = df_final.profile_background_image_url_https.value_counts().to_frame().reset_index()
# gp.columns = ['profile_background_image_url_https', 'counts']
# df_final = df_final.merge(gp, on='profile_background_image_url_https', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_background_image_url_https'] = np.where(df_final['counts']<50, '1', df_final['profile_background_image_url_https'])
# df_final = df_final.drop(['counts'], axis=1)

# #NEXT
# gp = df_final.profile_text_color.value_counts().to_frame().reset_index()
# gp.columns = ['profile_text_color', 'counts']
# df_final = df_final.merge(gp, on='profile_text_color', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_text_color'] = np.where(df_final['counts']<25, '1', df_final['profile_text_color'])
# df_final = df_final.drop(['counts'], axis=1)

# #NEXT
# gp = df_final.profile_sidebar_border_color.value_counts().to_frame().reset_index()
# gp.columns = ['profile_sidebar_border_color', 'counts']
# df_final = df_final.merge(gp, on='profile_sidebar_border_color', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_sidebar_border_color'] = np.where(df_final['counts']<55, '1', df_final['profile_sidebar_border_color'])
# df_final = df_final.drop(['counts'], axis=1)

# #NEXT
# gp = df_final.profile_sidebar_fill_color.value_counts().to_frame().reset_index()
# gp.columns = ['profile_sidebar_fill_color', 'counts']
# df_final = df_final.merge(gp, on='profile_sidebar_fill_color', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_sidebar_fill_color'] = np.where(df_final['counts']<65, '1', df_final['profile_sidebar_fill_color'])
# df_final = df_final.drop(['counts'], axis=1)
# df_final.profile_sidebar_fill_color.value_counts()

# #NEXT
# gp = df_final.profile_background_image_url.value_counts().to_frame().reset_index()
# gp.columns = ['profile_background_image_url', 'counts']
# df_final = df_final.merge(gp, on='profile_background_image_url', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_background_image_url'] = np.where(df_final['counts']<65, '1', df_final['profile_background_image_url'])
# df_final = df_final.drop(['counts'], axis=1)

# #NEXT
# gp = df_final.profile_background_color.value_counts().to_frame().reset_index()
# gp.columns = ['profile_background_color', 'counts']
# df_final = df_final.merge(gp, on='profile_background_color', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_background_color'] = np.where(df_final['counts']<65, '1', df_final['profile_background_color'])
# df_final = df_final.drop(['counts'], axis=1)

# #NEXT
# gp = df_final.profile_link_color.value_counts().to_frame().reset_index()
# gp.columns = ['profile_link_color', 'counts']
# df_final = df_final.merge(gp, on='profile_link_color', how='left')

# #Set threshold and adjust orignal column, drop merged column
# df_final['profile_link_color'] = np.where(df_final['counts']<65, '1', df_final['profile_link_color'])
# df_final = df_final.drop(['counts'], axis=1)

In [7]:
# Replace each profile-styling column below with a binary "is this value unique?" flag:
#   1 -> the row's value occurs exactly once in df_final
#   0 -> the value is shared with at least one other row, or is missing
# NOTE: this cell is not idempotent. Run a second time, it would measure the
# uniqueness of the 0/1 flags instead of the original values, so rebuild
# df_final from the concat cell before re-running.

cols_to_binary = ['profile_background_image_url_https', 'profile_text_color',
                  'profile_sidebar_border_color', 'profile_sidebar_fill_color',
                  'profile_background_image_url', 'profile_background_color',
                  'profile_link_color']

# df_final is built with pd.concat without ignore_index, so its index labels can
# repeat; renumber the rows 0..n-1 so the frame carries a clean positional index.
df_final = df_final.reset_index(drop=True)

for col in cols_to_binary:
    # Look up how often each row's value occurs. value_counts() skips NaN, so
    # missing values map to NaN here and end up flagged as 0.
    occurrences = df_final[col].map(df_final[col].value_counts())
    df_final[col] = occurrences.eq(1).astype(int)

In [8]:
# Class balance check: number of rows per target label (0 = real, 1 = spambot).
df_final['target'].value_counts()

0    5295
1    4448
Name: target, dtype: int64

In [10]:
# Persist the engineered table. index=True (the pandas default) is spelled out so it is
# clear that the row index is written as the first, unnamed column of Train.csv.
df_final.to_csv('Train.csv', index=True)