In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder

In [77]:
df_train = pd.read_csv("train.csv")
df_train_copy = df_train.copy()

In [78]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 24 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Id                                           7500 non-null   object 
 1   User Name                                    7500 non-null   object 
 2   Personal URL                                 3256 non-null   object 
 3   Profile Cover Image Status                   7410 non-null   object 
 4   Profile Verification Status                  7500 non-null   object 
 5   Profile Text Color                           7434 non-null   object 
 6   Profile Page Color                           7422 non-null   object 
 7   Profile Theme Color                          7428 non-null   object 
 8   Is Profile View Size Customized?             7500 non-null   bool   
 9   UTC Offset                                   7014 non-null   float64
 10  

**Dropping classes**

In [79]:
df_train.drop(labels=["User Name", "Personal URL", "Location", "User Time Zone"], axis=1, inplace=True)

**Managing TimeStamp**

In [80]:
trips_start = df_train[["Id", "Profile Creation Timestamp"]].copy()
trips_start["Profile Creation Timestamp"] = pd.to_datetime(trips_start["Profile Creation Timestamp"])

#https://www.w3resource.com/python-exercises/pandas/datetime/pandas-datetime-exercise-8.php
trips_start["Profile Creation Day"] = trips_start["Profile Creation Timestamp"].dt.day
trips_start["Profile Creation WeekDay"] = trips_start["Profile Creation Timestamp"].dt.day_name()
trips_start["Profile Creation Month"] = trips_start["Profile Creation Timestamp"].dt.month
trips_start["Profile Creation Year"] = trips_start["Profile Creation Timestamp"].dt.year
trips_start["Profile Creation Hour"] = trips_start["Profile Creation Timestamp"].dt.hour
trips_start = trips_start.drop(columns=["Profile Creation Timestamp"])

df_train = df_train.join(trips_start.set_index('Id'), on='Id')

#Here, this is an totally optional step. I moved the 3 features added to the dataframe to the front of the dataframe to make it easy to see them when printing
#https://stackoverflow.com/questions/25122099/move-column-by-name-to-front-of-table-in-pandas
for col_name in ["Profile Creation Hour", "Profile Creation WeekDay", "Profile Creation Day", "Profile Creation Month", "Profile Creation Year"]:
  mid = df_train[col_name]
  df_train.drop(labels=col_name, axis=1, inplace = True)
  df_train.insert(10, col_name, mid)

df_train.drop(columns=["Profile Creation Timestamp"], inplace=True)

**Converting Categories to numeric**

In [81]:
cover_encoder = LabelEncoder()
df_train['Profile Cover Image Status'] = df_train['Profile Cover Image Status'].replace(np.nan, "unknown", regex=True)
df_train['Profile Cover Image Status'] = cover_encoder.fit_transform(df_train['Profile Cover Image Status'])
print(cover_encoder.classes_)

verification_encoder = LabelEncoder()
df_train['Profile Verification Status'] = verification_encoder.fit_transform(df_train['Profile Verification Status'])
print(verification_encoder.classes_)


'''
text_color_encoder = LabelEncoder()
df_train['Profile Text Color'] = df_train['Profile Text Color'].replace(np.nan, 'none', regex=True)
df_train['Profile Text Color'].apply(lambda x: x if(len(str(x)) == 6) else 'none')
df_train['Profile Text Color'] = text_color_encoder.fit_transform(df_train['Profile Text Color'])
print( text_color_encoder.classes_)

page_color_encoder = LabelEncoder()
df_train["Profile Page Color"] = df_train['Profile Page Color'].replace(np.nan, 'none', regex=True)
df_train['Profile Page Color'].apply(lambda x: x if(len(str(x)) == 6) else "")
df_train["Profile Page Color"] = page_color_encoder.fit_transform(df_train["Profile Page Color"])
print( page_color_encoder.classes_)

theme_color_encoder = LabelEncoder()
df_train["Profile Theme Color"] = df_train['Profile Theme Color'].replace(np.nan, 'none', regex=True)
df_train['Profile Theme Color'].apply(lambda x: x if(len(str(x)) == 6) else "")
df_train["Profile Theme Color"] = theme_color_encoder.fit_transform(df_train["Profile Theme Color"])
print( theme_color_encoder.classes_)
'''

location_encoder = LabelEncoder()
df_train["Location Public Visibility"] = df_train['Location Public Visibility'].replace(np.nan, '??', regex=True)
df_train['Location Public Visibility'] = df_train['Location Public Visibility'].str.lower()
df_train["Location Public Visibility"] = location_encoder.fit_transform(df_train["Location Public Visibility"])
print( location_encoder.classes_)

view_size_encoder = LabelEncoder()
df_train["Is Profile View Size Customized?"] = view_size_encoder.fit_transform(df_train["Is Profile View Size Customized?"])
print( view_size_encoder.classes_)

language_encoder = LabelEncoder()
df_train["User Language"] = df_train['User Language'].str.lower()
df_train["User Language"] = language_encoder.fit_transform(df_train["User Language"])
print( language_encoder.classes_)

category_encoder = LabelEncoder()
df_train["Profile Category"] = df_train['Profile Category'].replace(np.nan, 'unknown', regex=True)
df_train["Profile Category"] = df_train['Profile Category'].replace(' ', 'unknown', regex=True)
df_train['Profile Category'] = df_train['Profile Category'].str.lower()
df_train["Profile Category"] = category_encoder.fit_transform(df_train["Profile Category"])
print( category_encoder.classes_)

df_train["Profile Creation WeekDay"] = df_train["Profile Creation WeekDay"].map({'Monday':0, 'Tuesday':1, 'Wenesday':2, 'Thursday':3, "Friday":4, "Saturday":5, "Sunday":6})
in_order_weekdays = ["Monday", "Tuesday", "Wenesday", "Thursday", "Friday", "Saturday", "Sunday"]
print(in_order_weekdays)



['Not set' 'Set' 'unknown']
['Not verified' 'Pending' 'Verified']
['??' 'disabled' 'enabled']
[False  True]
['ar' 'ca' 'cs' 'da' 'de' 'el' 'en' 'en-gb' 'es' 'fi' 'fr' 'hu' 'id' 'it'
 'ja' 'ko' 'nl' 'pl' 'pt' 'ru' 'sk' 'sr' 'sv' 'th' 'tr' 'uk' 'zh-cn'
 'zh-tw']
['business' 'celebrity' 'government' 'unknown']
['Monday', 'Tuesday', 'Wenesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


**Change unkown values in columns to NAN**


In [82]:
df_train["Profile Cover Image Status"].replace(2, np.nan, inplace=True)
'''
df_train["Profile Text Color"].replace(len(text_color_encoder.classes_)-1, np.nan, inplace=True)
df_train["Profile Page Color"].replace(len(page_color_encoder.classes_)-1, np.nan, inplace=True)
df_train["Profile Theme Color"].replace(len(theme_color_encoder.classes_)-1, np.nan, inplace=True)
'''
df_train["Location Public Visibility"].replace(0, np.nan, inplace=True)


**Rounding Daily Profile Visits and Clicks**

In [83]:
df_train["Avg Daily Profile Visit Duration in seconds"] = df_train["Avg Daily Profile Visit Duration in seconds"].round()
df_train["Avg Daily Profile Clicks"] = df_train["Avg Daily Profile Clicks"].round()

**Filling NAN values with appropriate replacement (Median)**

In [84]:
df_train = df_train.fillna(df_train.median())

**Convert Columns to appropriate type**

In [85]:

df_train.astype({'Profile Cover Image Status': 'int64', 
                 'UTC Offset': 'int64',
                 'Location Public Visibility': 'int64',
                 'Profile Creation WeekDay': 'int64',
                 'Avg Daily Profile Visit Duration in seconds': 'int64',
                 'Avg Daily Profile Clicks': 'int64'})


Unnamed: 0,Id,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location Public Visibility,User Language,Profile Creation Year,Profile Creation Month,Profile Creation Day,Profile Creation WeekDay,Profile Creation Hour,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image,Num of Profile Likes
0,AL85S14OMDPF01I9,1,2,db1a2c,eaf0f2,e70409,0,39600,2,6,2008,11,27,3,5,95763,4289,30809,873,0,15,2,AL85S14OMDPF01I9.png,2815
1,HI11QOPD7BLJTO7Q,1,2,0099cc,f6ffd1,fff04d,0,-14400,2,6,2010,1,15,4,18,1018746,289,8150,290,3,8,11,HI11QOPD7BLJTO7Q.png,1242
2,JS49LP5P72RI1OQB,1,0,1fc2de,efefef,1fc2de,0,-18000,2,6,2009,10,2,4,20,13444,1876,4698,227,3,32,1,JS49LP5P72RI1OQB.png,1559
3,S0GDSC09MACCLBJP,0,2,050000,616161,00090a,0,-14400,2,6,2009,2,19,3,14,339168,1148,53216,4035,0,23,4,S0GDSC09MACCLBJP.png,6342
4,CRSEMK4QER6LDJSA,1,0,58424d,f7f7f7,000000,0,-18000,2,6,2009,3,31,1,13,9215,93,3271,130,3,8,4,CRSEMK4QER6LDJSA.png,1078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,KP5DR7CTN8CNLG8P,0,2,490707,ddeef6,000000,0,-25200,1,6,2011,1,20,3,20,6086,76,985,56,3,25,8,KP5DR7CTN8CNLG8P.png,1373
7496,SGNF9P1P3922T34T,0,0,000000,ffffff,ffffff,0,-10800,2,18,2011,3,16,3,3,280190,38604,20652,172,3,26,1,SGNF9P1P3922T34T.png,1407
7497,O00FTLDT6O803LD0,1,2,0084b4,ddffcc,00133b,0,7200,2,6,2009,3,14,5,16,21647,49,2348,271,3,15,3,O00FTLDT6O803LD0.png,1235
7498,9F04JLR9JGT8R3EJ,1,0,91d2fa,ddffcc,fa743e,0,-10800,2,18,2009,7,25,5,16,89005,256,73830,346,0,12,2,9F04JLR9JGT8R3EJ.png,1105


**Saving Model to CSV and Pkl**

In [86]:
df_train

Unnamed: 0,Id,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location Public Visibility,User Language,Profile Creation Year,Profile Creation Month,Profile Creation Day,Profile Creation WeekDay,Profile Creation Hour,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image,Num of Profile Likes
0,AL85S14OMDPF01I9,1.0,2,db1a2c,eaf0f2,e70409,0,39600.0,2.0,6,2008,11,27,3.0,5,95763,4289,30809,873,0,15.0,2.0,AL85S14OMDPF01I9.png,2815
1,HI11QOPD7BLJTO7Q,1.0,2,0099cc,f6ffd1,fff04d,0,-14400.0,2.0,6,2010,1,15,4.0,18,1018746,289,8150,290,3,8.0,11.0,HI11QOPD7BLJTO7Q.png,1242
2,JS49LP5P72RI1OQB,1.0,0,1fc2de,efefef,1fc2de,0,-18000.0,2.0,6,2009,10,2,4.0,20,13444,1876,4698,227,3,32.0,1.0,JS49LP5P72RI1OQB.png,1559
3,S0GDSC09MACCLBJP,0.0,2,050000,616161,00090a,0,-14400.0,2.0,6,2009,2,19,3.0,14,339168,1148,53216,4035,0,23.0,4.0,S0GDSC09MACCLBJP.png,6342
4,CRSEMK4QER6LDJSA,1.0,0,58424d,f7f7f7,000000,0,-18000.0,2.0,6,2009,3,31,1.0,13,9215,93,3271,130,3,8.0,4.0,CRSEMK4QER6LDJSA.png,1078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,KP5DR7CTN8CNLG8P,0.0,2,490707,ddeef6,000000,0,-25200.0,1.0,6,2011,1,20,3.0,20,6086,76,985,56,3,25.0,8.0,KP5DR7CTN8CNLG8P.png,1373
7496,SGNF9P1P3922T34T,0.0,0,000000,ffffff,ffffff,0,-10800.0,2.0,18,2011,3,16,3.0,3,280190,38604,20652,172,3,26.0,1.0,SGNF9P1P3922T34T.png,1407
7497,O00FTLDT6O803LD0,1.0,2,0084b4,ddffcc,00133b,0,7200.0,2.0,6,2009,3,14,5.0,16,21647,49,2348,271,3,15.0,3.0,O00FTLDT6O803LD0.png,1235
7498,9F04JLR9JGT8R3EJ,1.0,0,91d2fa,ddffcc,fa743e,0,-10800.0,2.0,18,2009,7,25,5.0,16,89005,256,73830,346,0,12.0,2.0,9F04JLR9JGT8R3EJ.png,1105


In [87]:
df_train.to_pickle("train_cleaned.pkl")
df_train.to_csv("train_cleaned.csv")