# **Instructions**

To generate the train/test files, do the following:

*   To generate train files: Set **generateTrain** to **True** and **generateTest** to **False** and go to Runtime -> Run all
*   To generate test files: Set **generateTest** to **True** and **generateTrain** to **False** and go to Runtime -> Run all

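Please generate the train files first and then the test files, in the same runtime session: the test run reuses the language encoding (`language_dict`) that is built during the train run.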

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder

In [23]:
# Mode switches: set exactly one of them to True before running the notebook.
generateTrain = False
generateTest = True

# Equal flags (both True or both False) are ambiguous, so fall back to train mode.
if generateTrain == generateTest:
  print("Can't generate both of them at the same time, generating Train..")
  generateTrain, generateTest = True, False

# Whichever split is selected is loaded under the name df_train, together with
# an untouched copy of the raw frame.
for split_selected, csv_path in ((generateTrain, "train.csv"), (generateTest, "test.csv")):
  if split_selected:
    df_train = pd.read_csv(csv_path)
    df_train_copy = df_train.copy()

In [24]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 23 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Id                                           2500 non-null   object 
 1   User Name                                    2500 non-null   object 
 2   Personal URL                                 1038 non-null   object 
 3   Profile Cover Image Status                   2480 non-null   object 
 4   Profile Verification Status                  2500 non-null   object 
 5   Profile Text Color                           2464 non-null   object 
 6   Profile Page Color                           2473 non-null   object 
 7   Profile Theme Color                          2477 non-null   object 
 8   Is Profile View Size Customized?             2500 non-null   bool   
 9   UTC Offset                                   2352 non-null   float64
 10  

**Dropping columns**

In [25]:
# Remove the columns that are not used by the rest of the pipeline.
df_train = df_train.drop(columns=["User Name", "Location", "User Time Zone"])

**Converting URL to binary**

In [26]:
# Turn the URL into a presence flag: 1.0 when a personal URL is given, 0.0 when
# it is missing. An explicit null check replaces the previous regex-based
# replace() calls, which depended on pandas silently downcasting the object
# column to numbers.
df_train["Personal URL"] = df_train["Personal URL"].notna().astype("float64")

df_train

Unnamed: 0,Id,Personal URL,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location Public Visibility,User Language,Profile Creation Timestamp,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image
0,49I3SOKLI2CMNGP4,0.0,Set,Not verified,fa0a86,fc37c4,0a0101,False,-18000.0,Enabled,en,Mon Jul 20 21:05:24 +0000 2009,31528,2148,12926,469,business,13.827,3.4062,49I3SOKLI2CMNGP4.png
1,727IRIR59A3P88LK,0.0,Not set,Not verified,ff0000,e6e6e6,001941,False,36000.0,Enabled,en,Wed Aug 05 22:31:34 +0000 2009,42857,262,11678,1806,government,24.019,3.2656,727IRIR59A3P88LK.png
2,LN95SD15SRPCEE8F,1.0,Set,Verified,0d0101,000000,000000,False,-25200.0,Enabled,en,Fri Mar 18 18:36:02 +0000 2011,40237847,392,9395,59247,unknown,26.011,12.1619,LN95SD15SRPCEE8F.png
3,TB11I7F0PN033D4T,0.0,Set,Verified,0000ff,e0ff92,9ae4e8,False,-25200.0,??,en,Sun Apr 08 05:45:46 +0000 2007,230166,702,10507,2695,celebrity,26.127,2.1131,TB11I7F0PN033D4T.png
4,32PSGCK5PATHMR07,1.0,Set,Not verified,992f09,ddeef6,ffffff,False,,Enabled,en,Mon Jun 04 19:51:04 +0000 2012,1199,1701,1022,8,unknown,17.878,3.9660,32PSGCK5PATHMR07.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,7SDJE48EFRPPNEJK,1.0,Set,Verified,ffcc4d,ddeef6,080906,False,-18000.0,Disabled,en,Thu Oct 22 15:37:48 +0000 2009,2997645,879,7287,9429,unknown,19.496,5.4646,7SDJE48EFRPPNEJK.png
2496,PRT8RDNG6E86518P,0.0,Set,Not verified,1200f0,78c9f2,000000,False,-14400.0,Enabled,en,Tue Jul 15 20:30:25 +0000 2008,25528,5833,17282,912,government,13.185,1.8277,PRT8RDNG6E86518P.png
2497,SRNIBIK27BQ2M3PB,0.0,Not set,Not verified,0084b4,f6ffd1,9ae4e8,False,-10800.0,Enabled,pt,Thu Jul 21 13:38:14 +0000 2011,375639,838,52308,727,business,22.497,7.1340,SRNIBIK27BQ2M3PB.png
2498,6CP232J9R8N84702,0.0,Set,Not verified,69bf19,fff2c0,c1272c,False,-25200.0,Disabled,en,Thu Apr 02 16:46:07 +0000 2009,10376,707,2028,316,unknown,26.010,5.2116,6CP232J9R8N84702.png


**Managing TimeStamp**

In [27]:
# Parse the raw creation timestamp once; every calendar feature is derived from it.
creation_ts = pd.to_datetime(df_train["Profile Creation Timestamp"])

#https://www.w3resource.com/python-exercises/pandas/datetime/pandas-datetime-exercise-8.php
# The five derived features, listed in the left-to-right order in which they
# end up in the frame.
creation_features = {
  "Profile Creation Year": creation_ts.dt.year,
  "Profile Creation Month": creation_ts.dt.month,
  "Profile Creation Day": creation_ts.dt.day,
  "Profile Creation WeekDay": creation_ts.dt.day_name(),
  "Profile Creation Hour": creation_ts.dt.hour,
}

# Purely cosmetic placement: the new features are inserted starting at column
# position 10 so they are easy to spot when the frame is printed.
# Assigning the Series directly (they share df_train's index) avoids the former
# copy + join on "Id", which would have duplicated rows for any repeated Id.
#https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
first_feature_position = 10
for offset, (col_name, feature_values) in enumerate(creation_features.items()):
  df_train.insert(first_feature_position + offset, col_name, feature_values)

# The raw timestamp is no longer needed once its parts have been extracted.
df_train = df_train.drop(columns=["Profile Creation Timestamp"])

**Converting Categories to numeric**

In [28]:
# Cover image status: missing values become their own "unknown" class before encoding.
cover_encoder = LabelEncoder()
df_train['Profile Cover Image Status'] = df_train['Profile Cover Image Status'].fillna("unknown")
df_train['Profile Cover Image Status'] = cover_encoder.fit_transform(df_train['Profile Cover Image Status'])
print(cover_encoder.classes_)

# Location visibility: missing values share the existing '??' placeholder, and
# the labels are lower-cased so differently-cased spellings collapse together.
location_encoder = LabelEncoder()
df_train["Location Public Visibility"] = df_train['Location Public Visibility'].fillna('??')
df_train['Location Public Visibility'] = df_train['Location Public Visibility'].str.lower()
df_train["Location Public Visibility"] = location_encoder.fit_transform(df_train["Location Public Visibility"])
print( location_encoder.classes_)

view_size_encoder = LabelEncoder()
df_train["Is Profile View Size Customized?"] = view_size_encoder.fit_transform(df_train["Is Profile View Size Customized?"])
print( view_size_encoder.classes_)

# User language: the label -> code mapping is learned in train mode and reused
# in test mode so that both files share the same codes.
language_encoder = LabelEncoder()
df_train["User Language"] = df_train['User Language'].str.lower()
if generateTrain:
  df_train["User Language"] = language_encoder.fit_transform(df_train["User Language"])
  print( language_encoder.classes_)
  language_dict = {value: key for (key, value) in enumerate(language_encoder.classes_)}
else:
  # language_dict only exists if a train run was executed earlier in this kernel;
  # fail with an actionable message instead of a bare NameError.
  if "language_dict" not in globals():
    raise NameError("language_dict is not defined: run the notebook with generateTrain = True "
                    "in this session before generating the test files")
  # Languages that were not seen during the train run become NaN.
  df_train["User Language"] = df_train["User Language"].map(language_dict)


# NOTE: category_encoder is created here but not fitted in this cell.
category_encoder = LabelEncoder()
df_train["Profile Category"] = df_train['Profile Category'].fillna('unknown')
# regex=True substitutes every space character found inside a value.
df_train["Profile Category"] = df_train['Profile Category'].replace(' ', 'unknown', regex=True)
df_train['Profile Category'] = df_train['Profile Category'].str.lower()

# Weekday names -> 0..6 starting on Monday. The names must match the output of
# dt.day_name() exactly; the former misspelling 'Wenesday' turned every
# Wednesday into NaN.
in_order_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
weekday_codes = {day_name: code for code, day_name in enumerate(in_order_weekdays)}
df_train["Profile Creation WeekDay"] = df_train["Profile Creation WeekDay"].map(weekday_codes)
print(in_order_weekdays)



['Not set' 'Set' 'unknown']
['??' 'disabled' 'enabled']
[False  True]
['Monday', 'Tuesday', 'Wenesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


In [29]:
# Display the current state of the frame (a bare last expression is rendered by the notebook).
df_train

Unnamed: 0,Id,Personal URL,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location Public Visibility,...,Profile Creation Hour,User Language,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image
0,49I3SOKLI2CMNGP4,0.0,1,Not verified,fa0a86,fc37c4,0a0101,0,-18000.0,2,...,21,6.0,31528,2148,12926,469,business,13.827,3.4062,49I3SOKLI2CMNGP4.png
1,727IRIR59A3P88LK,0.0,0,Not verified,ff0000,e6e6e6,001941,0,36000.0,2,...,22,6.0,42857,262,11678,1806,government,24.019,3.2656,727IRIR59A3P88LK.png
2,LN95SD15SRPCEE8F,1.0,1,Verified,0d0101,000000,000000,0,-25200.0,2,...,18,6.0,40237847,392,9395,59247,unknown,26.011,12.1619,LN95SD15SRPCEE8F.png
3,TB11I7F0PN033D4T,0.0,1,Verified,0000ff,e0ff92,9ae4e8,0,-25200.0,0,...,5,6.0,230166,702,10507,2695,celebrity,26.127,2.1131,TB11I7F0PN033D4T.png
4,32PSGCK5PATHMR07,1.0,1,Not verified,992f09,ddeef6,ffffff,0,,2,...,19,6.0,1199,1701,1022,8,unknown,17.878,3.9660,32PSGCK5PATHMR07.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,7SDJE48EFRPPNEJK,1.0,1,Verified,ffcc4d,ddeef6,080906,0,-18000.0,1,...,15,6.0,2997645,879,7287,9429,unknown,19.496,5.4646,7SDJE48EFRPPNEJK.png
2496,PRT8RDNG6E86518P,0.0,1,Not verified,1200f0,78c9f2,000000,0,-14400.0,2,...,20,6.0,25528,5833,17282,912,government,13.185,1.8277,PRT8RDNG6E86518P.png
2497,SRNIBIK27BQ2M3PB,0.0,0,Not verified,0084b4,f6ffd1,9ae4e8,0,-10800.0,2,...,13,18.0,375639,838,52308,727,business,22.497,7.1340,SRNIBIK27BQ2M3PB.png
2498,6CP232J9R8N84702,0.0,1,Not verified,69bf19,fff2c0,c1272c,0,-25200.0,1,...,16,6.0,10376,707,2028,316,unknown,26.010,5.2116,6CP232J9R8N84702.png


**Converting colors to numeric**

In [30]:
def hex2decimal(hex_value):
  """Convert a 6-character hex colour into an "r|g|b" string of 0-1 fractions.

  Each pair of hex digits is divided by 255 and formatted with two decimals,
  e.g. "ff0000" -> "1.00|0.00|0.00". Any value whose string form is not exactly
  six characters long is returned unchanged.
  """
  digits = str(hex_value)
  if len(digits) != 6:
    return hex_value

  # One two-digit slice per channel: red, green, blue.
  channels = [f"{int(digits[start:start + 2], 16) / 255.0:.2f}" for start in (0, 2, 4)]
  return "|".join(channels)



In [31]:
color_col = "Profile Text Color"

# Anything that is not a 6-character value (including NaN) is treated as missing.
df_train[color_col] = df_train[color_col].apply(lambda x: x if(len(str(x)) == 6) else np.nan)
df_train[color_col] = df_train[color_col].apply(hex2decimal)
new_cols = df_train[color_col].str.split("|", expand = True)

# Insert R, G, B directly in front of the original colour column (in that order).
# The channels are converted to floats here; left as strings, the later median
# imputation would skip them and their missing values would never be filled.
for channel_idx, channel in enumerate("RGB"):
  df_train.insert(df_train.columns.get_loc(color_col), f"{color_col} {channel}", pd.to_numeric(new_cols[channel_idx]))

df_train = df_train.drop(columns=[color_col])

In [32]:
color_col = "Profile Page Color"

# Anything that is not a 6-character value (including NaN) is treated as missing.
df_train[color_col] = df_train[color_col].apply(lambda x: x if(len(str(x)) == 6) else np.nan)
df_train[color_col] = df_train[color_col].apply(hex2decimal)
new_cols = df_train[color_col].str.split("|", expand = True)

# Insert R, G, B directly in front of the original colour column (in that order).
# The channels are converted to floats here; left as strings, the later median
# imputation would skip them and their missing values would never be filled.
for channel_idx, channel in enumerate("RGB"):
  df_train.insert(df_train.columns.get_loc(color_col), f"{color_col} {channel}", pd.to_numeric(new_cols[channel_idx]))

df_train = df_train.drop(columns=[color_col])

In [33]:
color_col = "Profile Theme Color"

# Anything that is not a 6-character value (including NaN) is treated as missing.
df_train[color_col] = df_train[color_col].apply(lambda x: x if(len(str(x)) == 6) else np.nan)
df_train[color_col] = df_train[color_col].apply(hex2decimal)
new_cols = df_train[color_col].str.split("|", expand = True)

# Insert R, G, B directly in front of the original colour column (in that order).
# The channels are converted to floats here; left as strings, the later median
# imputation would skip them and their missing values would never be filled.
for channel_idx, channel in enumerate("RGB"):
  df_train.insert(df_train.columns.get_loc(color_col), f"{color_col} {channel}", pd.to_numeric(new_cols[channel_idx]))

df_train = df_train.drop(columns=[color_col])

**Change unknown values in columns to NaN**


In [34]:
# Turn the placeholder classes ("unknown" cover status, "??" location visibility)
# back into NaN so they are imputed like any other missing value.
# The integer code of each placeholder is looked up from the fitted encoder
# instead of being hard-coded (2 and 0 for the data shown in this notebook):
# when a file contains no placeholder at all, a hard-coded code would hit a
# real class. The result is assigned back because an in-place replace on a
# selected column is not guaranteed to update the frame.
cover_classes = list(cover_encoder.classes_)
if "unknown" in cover_classes:
  df_train["Profile Cover Image Status"] = df_train["Profile Cover Image Status"].replace(cover_classes.index("unknown"), np.nan)

location_classes = list(location_encoder.classes_)
if "??" in location_classes:
  df_train["Location Public Visibility"] = df_train["Location Public Visibility"].replace(location_classes.index("??"), np.nan)


**Rounding Daily Profile Visits and Clicks**

In [35]:
# Round both daily averages to the nearest whole number.
for rounded_col in ("Avg Daily Profile Visit Duration in seconds", "Avg Daily Profile Clicks"):
  df_train[rounded_col] = df_train[rounded_col].round(0)

**Filling NaN values with an appropriate replacement (median)**

In [36]:
# Impute missing values with each column's median. numeric_only=True restricts
# the median to numeric columns; without it, recent pandas versions raise on the
# remaining text columns instead of skipping them.
df_train = df_train.fillna(df_train.median(numeric_only=True))

**Outlier deletion**

In [37]:
def delete_outliers(df, name_of_column):
    """Return the rows of ``df`` whose ``name_of_column`` value is not an IQR outlier.

    A value is kept when it lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of that
    column (bounds inclusive). Rows with a missing value in the column are
    dropped. ``df`` itself is not modified.

    Only the named column is inspected, so non-numeric columns elsewhere in the
    frame cannot break the quantile computation, and the inclusive bounds keep
    every row of a column whose IQR is zero.
    """
    values = df[name_of_column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = 1.5 * (q3 - q1)

    return df[values.between(q1 - fence, q3 + fence)]

def correct_ouliers(df, name_of_column):
    """Cap ``name_of_column`` at its 5th and 95th percentiles, in place.

    Values above the 95th percentile are set to it, values below the 5th
    percentile are set to that one. The same (mutated) ``df`` is returned.
    """
    lower_lim, upper_lim = (df[name_of_column].quantile(q) for q in (.05, .95))

    # Cap the high tail first, then the low tail.
    above_upper = df[name_of_column] > upper_lim
    df.loc[above_upper, name_of_column] = upper_lim

    below_lower = df[name_of_column] < lower_lim
    df.loc[below_lower, name_of_column] = lower_lim
    return df

# Outlier rows are removed in train mode only, one continuous column after another.
if generateTrain:
  continuous_columns = [
    "Num of Followers",
    "Num of People Following",
    "Num of Status Updates",
    "Num of Direct Messages",
    "Avg Daily Profile Visit Duration in seconds",
    "Avg Daily Profile Clicks",
    "Num of Profile Likes",
  ]
  for outlier_col in continuous_columns:
    df_train = delete_outliers(df_train, name_of_column=outlier_col)

**Normalizing continuous values using log10**

In [38]:
# Continuous columns present in both modes.
shared_continuous_columns = [
  "Num of Followers",
  "Num of People Following",
  "Num of Status Updates",
  "Num of Direct Messages",
  "Avg Daily Profile Visit Duration in seconds",
  "Avg Daily Profile Clicks",
]

# "Num of Profile Likes" is only included in train mode.
if generateTrain:
  continuous_columns = shared_continuous_columns + ["Num of Profile Likes"]

if generateTest:
  continuous_columns = list(shared_continuous_columns)

# log10(x + 1): the +1 keeps zero values finite.
for log_col in continuous_columns:
  df_train[log_col] = np.log10(1 + df_train[log_col])

**Convert Columns to appropriate type**

In [39]:

# Columns cast to 64-bit integers.
int64_columns = [
  'Profile Cover Image Status',
  'Personal URL',
  'UTC Offset',
  'Location Public Visibility',
  'Profile Creation WeekDay',
]

# Colour channel columns cast to 64-bit floats.
float64_columns = [
  "Profile Text Color R",
  "Profile Text Color G",
  "Profile Text Color B",
  "Profile Page Color R",
  "Profile Page Color G",
  "Profile Page Color B",
  "Profile Theme Color R",
  "Profile Theme Color G",
  "Profile Theme Color B",
]

target_dtypes = dict.fromkeys(int64_columns, 'int64')
target_dtypes.update(dict.fromkeys(float64_columns, 'float64'))
df_train = df_train.astype(target_dtypes)


**Onehot encoding**

In [40]:
# Collapse location visibility to a binary flag: code 1 -> 0, every other code -> 1.
df_train['Location Public Visibility'] = df_train['Location Public Visibility'].ne(1).astype('int64')

# Replace each categorical column by its one-hot indicator columns, which are
# appended at the end of the frame in this order.
for categorical_col in ['Profile Creation Year', 'UTC Offset', 'Profile Verification Status', 'Profile Category']:
  encoded_columns = pd.get_dummies(df_train[categorical_col])
  df_train = df_train.join(encoded_columns).drop(columns=categorical_col)


# In train mode, move 'Num of Profile Likes' to the last column.
if generateTrain:
  likes_column = df_train['Num of Profile Likes']
  df_train = df_train.drop(columns='Num of Profile Likes').join(likes_column)

**Saving the cleaned data to CSV and pickle**

In [41]:
# Display the fully processed frame before it is written to disk.
df_train

Unnamed: 0,Id,Personal URL,Profile Cover Image Status,Profile Text Color R,Profile Text Color G,Profile Text Color B,Profile Page Color R,Profile Page Color G,Profile Page Color B,Profile Theme Color R,...,37800,39600,46800,Not verified,Pending,Verified,business,celebrity,government,unknown
0,49I3SOKLI2CMNGP4,0,1,0.98,0.04,0.53,0.99,0.22,0.77,0.04,...,0,0,0,1,0,0,1,0,0,0
1,727IRIR59A3P88LK,0,0,1.00,0.00,0.00,0.90,0.90,0.90,0.00,...,0,0,0,1,0,0,0,0,1,0
2,LN95SD15SRPCEE8F,1,1,0.05,0.00,0.00,0.00,0.00,0.00,0.00,...,0,0,0,0,0,1,0,0,0,1
3,TB11I7F0PN033D4T,0,1,0.00,0.00,1.00,0.88,1.00,0.57,0.60,...,0,0,0,0,0,1,0,1,0,0
4,32PSGCK5PATHMR07,1,1,0.60,0.18,0.04,0.87,0.93,0.96,1.00,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,7SDJE48EFRPPNEJK,1,1,1.00,0.80,0.30,0.87,0.93,0.96,0.03,...,0,0,0,0,0,1,0,0,0,1
2496,PRT8RDNG6E86518P,0,1,0.07,0.00,0.94,0.47,0.79,0.95,0.00,...,0,0,0,1,0,0,0,0,1,0
2497,SRNIBIK27BQ2M3PB,0,0,0.00,0.52,0.71,0.96,1.00,0.82,0.60,...,0,0,0,1,0,0,1,0,0,0
2498,6CP232J9R8N84702,0,1,0.41,0.75,0.10,1.00,0.95,0.75,0.76,...,0,0,0,1,0,0,0,0,0,1


In [42]:
# Write the processed frame as both pickle and CSV, under the file names of
# whichever mode is active.
output_targets = (
  (generateTrain, "train_cleaned.pkl", "train_cleaned.csv"),
  (generateTest, "test_cleaned.pkl", "test_cleaned.csv"),
)
for mode_enabled, pickle_path, csv_path in output_targets:
  if mode_enabled:
    df_train.to_pickle(pickle_path)
    df_train.to_csv(csv_path)