# Data Inputs and Display Libraries



In [None]:

import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell


def _fmt_float(value):
    """Format a float with five decimal places for DataFrame display."""
    return '%.5f' % value


# Fixed five-decimal rendering for floats in pandas output.
pd.set_option('display.float_format', _fmt_float)
# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'


# EDA Libraries

# Data Preprocessing Libraries

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder


# Feature Selection & Modeling Libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
import pickle


In [None]:
# Accessing the data
# Download the two archives and the two column-name CSVs into the current
# working directory (needs network access and the `wget` command).
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/datasets.rar" 
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/prep_file.rar" 
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/num_cols.csv" 
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/str_cols.csv" 

# Extract both .rar archives in place (needs the `unrar` command).
!unrar x './datasets.rar'
!unrar x './prep_file.rar'

--2022-03-10 06:25:06--  https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/datasets.rar
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/univai-ghf/ghfmedia/main/data/Trees_and_Ensembles/datasets.rar [following]
--2022-03-10 06:25:07--  https://raw.githubusercontent.com/univai-ghf/ghfmedia/main/data/Trees_and_Ensembles/datasets.rar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3508143 (3.3M) [application/octet-stream]
Saving to: ‘datasets.rar’


2022-03-10 06:25:07 (31.3 MB/s) - ‘datasets.rar’ saved [3508143/3508143]

--2022-03-10 06:25:07--  https://github.com/univai-ghf

In [None]:
# Load the prepared dataset from the working directory; the default
# separator is a comma, so no `sep` argument is needed.
t1 = pd.read_csv("./prep_file.csv")

# Step 1 - Look at the data

In [None]:
# Preview the first rows of the loaded frame.
t1.head()

Unnamed: 0.1,Unnamed: 0,income,age,experience,bureau_score,married,house_ownership,car_ownership,risk_flag,profession,city,state,current_job_years,current_house_years,device
0,19607,2514921,31.0,4.0,651.0,single,rented,no,0,Psychologist,Chandrapur,Maharashtra,4.0,14.0,Oppo
1,75516,7047674,28.0,4.0,526.0,single,rented,yes,0,Economist,Ramagundam[27],Telangana,3.0,13.0,Xiaomi
2,63804,2749317,30.0,2.0,526.0,single,rented,no,0,Secretary,Ramagundam[27],Telangana,2.0,14.0,samsung
3,63676,7378274,24.0,0.0,764.0,single,rented,no,0,Flight attendant,Adoni,Andhra Pradesh,0.0,11.0,samsung
4,50914,9574585,27.0,5.0,739.0,single,rented,yes,0,Technician,Imphal,Manipur,5.0,10.0,Vivo


In [None]:
# List the column labels of the loaded frame.
t1.columns

Index(['Unnamed: 0', 'income', 'age', 'experience', 'bureau_score', 'married',
       'house_ownership', 'car_ownership', 'risk_flag', 'profession', 'city',
       'state', 'current_job_years', 'current_house_years', 'device'],
      dtype='object')

# Look at data - target variable distribution

In [None]:
# Absolute number of rows per value of the target column.
t1["risk_flag"].value_counts()

0    236567
1     43433
Name: risk_flag, dtype: int64

In [None]:
# Same distribution expressed as proportions (normalize=True).
t1["risk_flag"].value_counts(normalize=True)

0   0.84488
1   0.15512
Name: risk_flag, dtype: float64

# Look at data - listing string and numeric columns

In [None]:
# Names of the string (categorical) columns, read from the downloaded CSV.
str_col_name_df = pd.read_csv("./str_cols.csv")
str_col_name_df.columns = ["index", "col_name"]
str_col_name_list = [name for name in str_col_name_df["col_name"]]

# Names of the numeric columns, read the same way.
num_col_name_df = pd.read_csv("./num_cols.csv")
num_col_name_df.columns = ["index", "col_name"]
num_col_name_df = num_col_name_df.reset_index()
num_col_name_list = [name for name in num_col_name_df["col_name"]]

In [None]:
# Print the string column names first, then the numeric ones.
for col_names in (str_col_name_list, num_col_name_list):
    print(col_names)

['profession', 'married', 'house_ownership', 'car_ownership', 'city', 'state']
['income', 'age', 'experience', 'current_job_years', 'current_house_years', 'bureau_score']


In [None]:
# Keep a copy of the frame as loaded, before the text columns are normalised.
t0 = t1.copy()
for i in str_col_name_list:
    # Lower-case and trim leading/trailing whitespace.
    t1[i] = t1[i].str.lower().str.strip()
    # Drop every run of characters that is not a lowercase letter or whitespace
    # (e.g. "ramagundam[27]" -> "ramagundam"). regex=True must be explicit:
    # from pandas 2.0 the default is regex=False, which would treat the pattern
    # as a literal string and silently leave the values unchanged.
    t1[i] = t1[i].str.replace(r"[^a-z\s]+", "", regex=True)


  after removing the cwd from sys.path.


# Train-Test Split
### Before any preprocessing, split the data so that train and test stay separate

In [None]:
# Hold out 33% of the rows for testing with a fixed seed. The whole frame is
# passed as X, so x_train / x_test still contain the "risk_flag" column.
x_train, x_test, y_train, y_test = train_test_split(
    t1,
    t1["risk_flag"],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Give both splits a fresh 0..n-1 index; drop is not requested, so the previous
# row labels are kept as an extra column.
x_train0, x_test0 = (split.reset_index() for split in (x_train, x_test))

In [None]:
# (rows, columns) of the train and test splits.
x_train0.shape, x_test0.shape

((187600, 16), (92400, 16))

# One-Hot Encoding Categorical Features and Label Encoding the Target Variable -- baseline categorical approach

In [None]:


# Fit the encoder on the training split only and reuse it for the test split.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present in the training data.
enc = OneHotEncoder(handle_unknown='ignore')
# .toarray() yields a plain ndarray (instead of the np.matrix from .todense()).
df_one_hot_tr = pd.DataFrame(enc.fit_transform(np.array(x_train0[str_col_name_list])).toarray())
df_one_hot_te = pd.DataFrame(enc.transform(np.array(x_test0[str_col_name_list])).toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and produces the same "x0_<cat>"
# style names here. Fall back to the old method on versions that predate it.
if hasattr(enc, "get_feature_names_out"):
    colnames = enc.get_feature_names_out()
else:
    colnames = enc.get_feature_names()

df_one_hot_tr.columns = colnames
df_one_hot_te.columns = colnames



In [None]:
# Integer-typed view of the first encoded training rows, for display only.
viz1 = df_one_hot_tr.head().astype(int)
viz1.columns = colnames
viz1

Unnamed: 0,x0_air traffic controller,x0_analyst,x0_architect,x0_army officer,x0_artist,x0_aviator,x0_biomedical engineer,x0_chartered accountant,x0_chef,x0_chemical engineer,...,x5_puducherry,x5_punjab,x5_rajasthan,x5_sikkim,x5_tamil nadu,x5_telangana,x5_tripura,x5_uttar pradesh,x5_uttarakhand,x5_west bengal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Replace NaNs in the numeric columns with the column mean. The means are
# learned from the training split only and then applied to both splits.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(x_train0[num_col_name_list])

x_train_num = pd.DataFrame(
    imp_mean.transform(x_train0[num_col_name_list]),
    columns=num_col_name_list,
)
x_test_num = pd.DataFrame(
    imp_mean.transform(x_test0[num_col_name_list]),
    columns=num_col_name_list,
)

SimpleImputer()

# Concatenating Numeric and Categorical Features

In [None]:
# Place the one-hot columns and the imputed numeric columns side by side.
df_all_train1 = pd.concat((df_one_hot_tr, x_train_num), axis="columns")
df_all_test1 = pd.concat((df_one_hot_te, x_test_num), axis="columns")

In [None]:
# Feature names in the same order as the concatenated columns:
# one-hot names first, numeric names after.
all_cols = [*colnames, *num_col_name_list]

In [None]:
##back to presentation

In [None]:

# Learn the label mapping from the training target, then apply that same
# mapping to both the training and the test targets.
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train1, y_test1 = (le.transform(target) for target in (y_train, y_test))

:

In [None]:
#Feature selection

selector = SelectPercentile(f_classif, percentile=60)
selector.fit(df_all_train1, y_train1)
df_all_train2 = selector.transform(df_all_train1)
df_all_test2 = selector.transform(df_all_test1)

SelectPercentile(percentile=60)

In [None]:
# Training-set shape before and after feature selection.
df_all_train1.shape, df_all_train2.shape

((187600, 408), (187600, 245))

In [None]:
# Boolean mask over the input features: True where the selector kept the column.
columns_flag = selector.get_support()
# all_cols is a plain list, so pair each name with its flag rather than
# boolean-indexing the list.
sel_cols = [col for col, keep in zip(all_cols, columns_flag) if keep]

In [None]:
# Display the names of the features that were kept.
sel_cols

['x0_air traffic controller',
 'x0_analyst',
 'x0_army officer',
 'x0_artist',
 'x0_chef',
 'x0_civil engineer',
 'x0_comedian',
 'x0_computer operator',
 'x0_dentist',
 'x0_design engineer',
 'x0_drafter',
 'x0_economist',
 'x0_firefighter',
 'x0_flight attendant',
 'x0_graphic designer',
 'x0_industrial engineer',
 'x0_lawyer',
 'x0_librarian',
 'x0_magistrate',
 'x0_mechanical engineer',
 'x0_petroleum engineer',
 'x0_physician',
 'x0_police officer',
 'x0_politician',
 'x0_psychologist',
 'x0_scientist',
 'x0_secretary',
 'x0_surveyor',
 'x0_technical writer',
 'x0_technician',
 'x0_web designer',
 'x1_married',
 'x1_single',
 'x2_norentnoown',
 'x2_owned',
 'x2_rented',
 'x3_no',
 'x3_yes',
 'x4_agartala',
 'x4_agra',
 'x4_aizawl',
 'x4_akola',
 'x4_alappuzha',
 'x4_allahabad',
 'x4_alwar',
 'x4_ambala',
 'x4_amravati',
 'x4_anantapuram',
 'x4_arrah',
 'x4_asansol',
 'x4_aurangabad',
 'x4_avadi',
 'x4_ballia',
 'x4_barasat',
 'x4_bathinda',
 'x4_begusarai',
 'x4_belgaum',
 'x4_bel

In [None]:
### For understanding only -- not meant to be run

In [None]:
def pik_now(ob_name):
    """Pickle a module-level object to a file named after it.

    Parameters
    ----------
    ob_name : str
        Name of a global variable in this module/notebook. The same string is
        used as the output path (relative to the working directory, with no
        extension added).

    Returns
    -------
    None

    Raises
    ------
    NameError
        If no global called ``ob_name`` exists. The name is checked before the
        file is opened, so a bad name does not leave an empty file behind.
    """
    namespace = globals()
    if ob_name not in namespace:
        raise NameError("name {!r} is not defined".format(ob_name))

    # A plain dictionary lookup replaces eval(): only a variable name is needed,
    # and eval would execute arbitrary expressions passed in as ob_name.
    # The context manager closes the file even if pickle.dump raises.
    with open(ob_name, "wb") as pickling_on:
        pickle.dump(namespace[ob_name], pickling_on)

    return

In [None]:
# Names of the notebook-level objects to persist; pik_now writes each one to a
# file with the same name in the working directory.
list_objs = [
    "df_all_train2",
    "y_train1",
    "df_all_test2",
    "y_test1",
]

for obj_name in list_objs:
    pik_now(obj_name)