In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Extracting features with high covariance

In [None]:
# Show up to 150 columns when a frame is displayed, then load the training set.
pd.set_option('display.max_columns', 150)
df = pd.read_csv('../input/costa-rican-household-poverty-prediction/train.csv')
df.head()

In [None]:
df.columns

In [None]:
# Basic size summary of the training data.
# The first line previously claimed to print a *number* but printed the array of
# unique Target values; report both the count and the values explicitly.
target_classes = np.sort(df['Target'].unique())
print('number of target classes ', len(target_classes), ' values ', target_classes)
print('number of data points ', len(df))
print('number of person data points' ,df['Id'].nunique())
print('number of unique identifier for each household',df['idhogar'].nunique())

`idhogar` is the household identifier, so its number of unique values is the total number of households.

In [None]:
import matplotlib.pyplot as plt
plt.hist(df['Target'])
plt.show()

Extracting important features based on correlation metric. We will only consider correlation between target variable and other parameters

In [None]:
df.info()

In [None]:
# Impute missing values column by column with that column's most frequent value.
# Series.mode() returns a Series (there can be ties), and fillna() with a Series
# aligns on the index, so passing the mode Series directly only fills rows whose
# index label happens to match the mode's 0..k-1 index. Use the first modal value
# as a scalar so every missing entry in the column is filled.
for column in df.columns:
    modes = df[column].mode()
    if modes.empty:
        # Column is entirely NaN: there is no mode to impute with.
        continue
    df[column] = df[column].fillna(modes.iloc[0])

df.isna().any()

In [None]:
cordf = df.drop(['Target'], axis=1)

# Absolute pairwise correlation between the numeric features.
# Restrict to numeric columns explicitly: corr() on a frame that still holds string
# columns raises in recent pandas instead of silently skipping them.
corr_matrix = cordf.select_dtypes(include=[np.number]).corr().abs()

# Keep only the strict upper triangle so each feature pair is inspected once.
# The builtin `bool` replaces the `np.bool` alias, which no longer exists in
# current NumPy releases.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A feature is redundant if it correlates above 0.95 with any earlier feature.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
# 'Id' is not part of the numeric correlation matrix, so it is dropped explicitly.
to_drop.append('Id')

# Drop features
cordf = cordf.drop(columns=to_drop)

In [None]:
print('number of column after removing highly correlated features ',len(cordf.columns))

In [None]:
# Absolute correlation matrix over the numeric columns of the full frame
# (including Target). Non-numeric columns are excluded up front rather than
# relying on corr() to skip them.
corr = df.select_dtypes(include=[np.number]).corr().abs()
target_corr = corr['Target']

to_drop = []
for column in cordf.columns:
    if column not in target_corr.index:
        # No correlation is available for this column (it is not numeric):
        # report it and drop it. This replaces a bare `except:` that would also
        # have hidden any unrelated error raised inside the loop.
        print(column)
        to_drop.append(column)
    elif target_corr[column] < 0.1:
        # Weak linear relationship with Target.
        # A NaN correlation compares False here, so such a column is kept.
        to_drop.append(column)
cordf = cordf.drop(columns=to_drop)

In [None]:
# This count follows the Target-correlation filter, not the inter-feature filter,
# so the message says so (it was previously a copy of the earlier step's message).
print('number of columns after removing features weakly correlated with Target ',len(cordf.columns))
cordf.columns

In [None]:
cordf.columns[cordf.isna().sum()!=0]

In [None]:
# Replace the remaining missing values in these three columns with 0.
for zero_fill_column in ('v2a1', 'SQBmeaned', 'meaneduc'):
    filled = cordf[zero_fill_column].fillna(0)
    cordf[zero_fill_column] = filled

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardise the selected features, then fit an SVM classifier on them.
standardise_step = StandardScaler()
svm_step = SVC(gamma='auto')
clf = make_pipeline(standardise_step, svm_step)
clf.fit(cordf, df['Target'])

In [None]:
clf.score(cordf,df['Target'])

In [None]:
def prepare_data(df):
    """Print basic counts for ``df`` and return a copy with missing values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Id', 'idhogar', 'v2a1', 'SQBmeaned' and
        'meaneduc'. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which 'v2a1', 'SQBmeaned' and 'meaneduc' have their
        missing values replaced by 0 and every other column has its missing
        values replaced by that column's most frequent value. Columns that are
        entirely missing are left unchanged because they have no mode.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    print('number of data points ', len(df))
    print('number of person data points' ,df['Id'].nunique())
    print('number of unique identifier for each household',df['idhogar'].nunique())

    # Work on a copy so the caller's frame is never mutated behind its back.
    df = df.copy()

    # Explicit zero-fill goes first so the generic mode imputation below cannot
    # pre-empt it for these three columns.
    for column in ('v2a1', 'SQBmeaned', 'meaneduc'):
        df[column] = df[column].fillna(0)

    for column in df.columns:
        if not df[column].isna().any():
            continue
        # Series.mode() returns a Series; fillna() with a Series aligns on the
        # index and would leave almost every NaN in place. Use the first modal
        # value as a scalar instead.
        modes = df[column].mode()
        if modes.empty:
            continue
        df[column] = df[column].fillna(modes.iloc[0])

    return df

In [None]:
# Load both splits; the label column is taken from the raw training frame.
x_train = pd.read_csv('../input/costa-rican-household-poverty-prediction/train.csv')
y_train = x_train['Target']

x_test = pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv')

# Impute missing values in each split (prepare_data also prints summary counts).
print('train data')
x_train = prepare_data(x_train)
print('test data')
x_test = prepare_data(x_test)

# Remove identifier and string-valued columns; the training frame additionally
# loses the label column.
excluded_columns = ['Id', 'idhogar', 'dependency', 'edjefe', 'edjefa']
x_train = x_train.drop(columns=['Target'] + excluded_columns)
x_test = x_test.drop(columns=excluded_columns)

In [None]:
x_train.info()

In [None]:
# Any training column that still contains NaN is removed from both splits.
has_missing = x_train.isna().any()
to_drop = x_train.columns[has_missing].tolist()
x_train = x_train.drop(columns=to_drop)
x_test = x_test.drop(columns=to_drop)

In [None]:
from sklearn import preprocessing

# Rescale the training features with MinMaxScaler; x_train is replaced by the
# transformed array.
# NOTE(review): x_test is not passed through this scaler in this cell - confirm
# whether that is intended.
scaler = preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [None]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_train, y_train, test_size=0.33, random_state=42)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fit a standardise + SVM pipeline on the training split and score the held-out
# split, once through the estimator's own score() and once via accuracy_score.
pipeline_steps = (StandardScaler(), SVC(gamma='auto'))
clf = make_pipeline(*pipeline_steps)
clf.fit(xtrain, ytrain)
ypred = clf.predict(xtest)
estimator_score = clf.score(xtest, ytest)
print(estimator_score)
prediction_accuracy = accuracy_score(ytest, ypred)
print(prediction_accuracy)

In [None]:
from sklearn.cluster import KMeans

# KMeans is unsupervised: its cluster ids (0..3) are arbitrary and are not the
# Target labels, so comparing them to ytest directly gives a meaningless score.
# Map every cluster to the most common training label among its members and
# evaluate the mapped predictions instead.
kmeans = KMeans(n_clusters=4, random_state=0).fit(xtrain)

train_labels = np.asarray(ytrain)
train_clusters = kmeans.labels_
# Used only if a cluster ends up with no training members.
fallback_label = pd.Series(train_labels).mode().iloc[0]

cluster_to_label = {}
for cluster_id in range(kmeans.n_clusters):
    member_labels = train_labels[train_clusters == cluster_id]
    if len(member_labels) == 0:
        cluster_to_label[cluster_id] = fallback_label
    else:
        cluster_to_label[cluster_id] = pd.Series(member_labels).mode().iloc[0]

ypred = np.array([cluster_to_label[cluster_id] for cluster_id in kmeans.predict(xtest)])
print(accuracy_score(ytest,ypred))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline: fit on the training split, score the held-out split.
gnb = GaussianNB()
gnb.fit(xtrain, ytrain)
ypred = gnb.predict(xtest)
nb_accuracy = accuracy_score(ytest, ypred)
print(nb_accuracy)

Feature engineering and EDA: so far we only used the numerical columns as they are, so we will now clean and process the data further to extract better features.

In [None]:
df=pd.read_csv('../input/costa-rican-household-poverty-prediction/train.csv')
df.columns[df.dtypes==object]

Id - unique identifier of each person
idhogar - unique identifier of each household
so we can ignore these two features


dependency - This is an important feature, but it is not clean. However, SQBdependency (the square of dependency) is clean, so we can extract dependency from it by taking the square root.

edjefe, edjefa - years of education of head of household

# Data preprocessing

In [None]:
df['dependency']=np.sqrt(df['SQBdependency'])

In [None]:
df.columns[df.isna().sum()!=0]

Columns with null values

v2a1 - monthly rent

v18q1 - number of tablets

rez_esc - years behind school

meaneduc - mean education for adults

SQBmeaned - square of meaned

meaneduc and SQBmeaned are highly correlated

In [None]:
# Report how many rows are missing a value in each column known to contain nulls.
for null_column in ('meaneduc', 'v2a1', 'v18q1', 'rez_esc', 'SQBmeaned'):
    rows_with_null = df[df[null_column].isnull()]
    print('total  number of rows with ' + null_column + ' as null values ', len(rows_with_null))

Handling v2a1: there are a lot of null values, so let's check whether all of these people own their house.

In [None]:
# Break down the rows with no recorded rent (v2a1 is null) by the tipovivi*
# indicator columns.
norent=df[df['v2a1'].isnull()]
print("Owns his house:", norent[norent['tipovivi1']==1]['Id'].count())
print("Owns his house paying installments", norent[norent['tipovivi2']==1]['Id'].count())
print("Precarious ", norent[norent['tipovivi4']==1]['Id'].count())
print("Other ", norent[norent['tipovivi5']==1]['Id'].count())
# Compute the total from the data rather than hard-coding it, so the figure stays
# correct if the input file or earlier cleaning steps change.
print("Total ", len(norent))

In [None]:
df['v2a1']=df['v2a1'].fillna(0)

Now lets look at v18q1- Number of tablets

In [None]:
print('total  number of rows with v18q1 as null values ',len(df[df['v18q1'].isnull()]))

We have v18q, which indicates whether there is a tablet in the household or not.

In [None]:
tabletnan=df[df['v18q1'].isnull()]
tabletnan[tabletnan['v18q']==0]['Id'].count()

In [None]:
df['v18q1']=df['v18q1'].fillna(0)

lets look at rez_esc which indicates the number of years a person is behind school

In [None]:
print(df['rez_esc'].isnull().sum())
df['rez_esc']=df['rez_esc'].fillna(0)

In [None]:
df.columns[df.isna().sum()!=0]

In [None]:
meaned=df[df['meaneduc'].isnull()]
meaned[['meaneduc','SQBmeaned']]

In [None]:
df['meaneduc']=df['meaneduc'].fillna(0)
df.drop(['SQBmeaned'],axis=1,inplace=True)

In [None]:
df.columns[df.isna().sum()!=0]

# EDA on data

Most of the features are related to each other like :-

* v14a, =1 has bathroom in the household
* refrig, =1 if the household has refrigerator
* v18q, owns a tablet
* v18q1, number of tablets household owns

so we will explore only one feature of each category

v2a1 - Monthly rent payment 

In [None]:
plt.hist(df['v2a1'])

* tamhog, size of the household
* tamviv, number of persons living in the household

In [None]:
# Each row is one person, in no particular order, so joining consecutive rows with
# a line produces a scribble rather than a relationship. Plot the pairs as points.
plt.scatter(df['tamhog'], df['tamviv'], alpha=0.3)
plt.xlabel('tamhog')
plt.ylabel('tamviv')
plt.title('tamhog vs tamviv')
print('as the size of household increases the nuumber of persons living in the house also increased')
plt.show()

* abastaguadentro, =1 if water provision inside the dwelling
* abastaguafuera, =1 if water provision outside the dwelling
* abastaguano, =1 if no water provision

In [None]:
# Number of rows flagged for each water-provision category.
water_categories = (
    ('water provision inside the dwelling ', 'abastaguadentro'),
    ('water provision outside the dwelling ', 'abastaguafuera'),
    ('no water provision ', 'abastaguano'),
)
for water_label, water_column in water_categories:
    print(water_label, df[water_column].sum())

In [None]:
# Rows with abastaguadentro == 1: mean Target and its distribution.
abastaguadentro = df.loc[df['abastaguadentro'] == 1]
abastaguadentro_target = abastaguadentro['Target']
print('mean poverty level of households for water provision inside the dwelling ', abastaguadentro_target.mean())
plt.hist(abastaguadentro_target)
plt.show()

In [None]:
# Rows with abastaguafuera == 1: mean Target and its distribution.
abastaguafuera = df.loc[df['abastaguafuera'] == 1]
abastaguafuera_target = abastaguafuera['Target']
print('mean poverty level of households for water provision outside the dwelling ', abastaguafuera_target.mean())
plt.hist(abastaguafuera_target)
plt.show()

In [None]:
# Rows with abastaguano == 1: mean Target and its distribution.
abastaguano = df.loc[df['abastaguano'] == 1]
abastaguano_target = abastaguano['Target']
print('mean poverty level of households no water provision  ', abastaguano_target.mean())
plt.hist(abastaguano_target)
plt.show()

* public, "=1 electricity from CNFL,  ICE,  ESPH/JASEC"
* planpri, =1 electricity from private plant
* noelec, =1 no electricity in the dwelling
* coopele, =1 electricity from cooperative

In [None]:
# Number of rows flagged for each electricity-source category.
electricity_categories = (
    ('electricity from CNFL, ICE, ESPH/JASEC', 'public'),
    ('electricity from private plant', 'planpri'),
    ('no electricity in the dwelling', 'noelec'),
    ('electricity from cooperative', 'coopele'),
)
for electricity_label, electricity_column in electricity_categories:
    print(electricity_label, df[electricity_column].sum())

In [None]:
# Rows with public == 1: mean Target and its distribution.
public = df.loc[df['public'] == 1]
public_target = public['Target']
print('electricity from CNFL, ICE, ESPH/JASEC', public_target.mean())
plt.hist(public_target)
plt.show()

In [None]:
# Rows with noelec == 1: mean Target and its distribution.
noelec = df.loc[df['noelec'] == 1]
noelec_target = noelec['Target']
print('no electricity in the dwelling', noelec_target.mean())
plt.hist(noelec_target)
plt.show()

In [None]:
# Rows with coopele == 1: mean Target and its distribution.
coopele = df.loc[df['coopele'] == 1]
coopele_target = coopele['Target']
print('electricity from cooperative', coopele_target.mean())
plt.hist(coopele_target)
plt.show()

1. sanitario1, =1 no toilet in the dwelling
1. sanitario2, =1 toilet connected to sewer or cesspool
1. sanitario3, =1 toilet connected to  septic tank
1. sanitario5, =1 toilet connected to black hole 
1. sanitario6, =1 toilet connected to other system

In [None]:
# Number of rows flagged for each toilet/sanitation category.
sanitation_categories = (
    ('no toilet in the dwelling', 'sanitario1'),
    ('toilet connected to sewer or cesspool', 'sanitario2'),
    ('toilet connected to septic tank', 'sanitario3'),
    ('toilet connected to black hole ', 'sanitario5'),
    ('toilet connected to other system', 'sanitario6'),
)
for sanitation_label, sanitation_column in sanitation_categories:
    print(sanitation_label, df[sanitation_column].sum())

In [None]:
# Rows with sanitario1 == 1: mean Target and its distribution.
sanitario1 = df.loc[df['sanitario1'] == 1]
sanitario1_target = sanitario1['Target']
print('no toilet in the dwelling', sanitario1_target.mean())
plt.hist(sanitario1_target)
plt.show()

In [None]:
# Rows with sanitario2 == 1: mean Target and its distribution.
sanitario2 = df.loc[df['sanitario2'] == 1]
sanitario2_target = sanitario2['Target']
print('toilet connected to sewer or cesspool', sanitario2_target.mean())
plt.hist(sanitario2_target)
plt.show()

In [None]:
# Rows with sanitario3 == 1: mean Target and its distribution.
sanitario3 = df.loc[df['sanitario3'] == 1]
sanitario3_target = sanitario3['Target']
print('toilet connected to septic tank', sanitario3_target.mean())
plt.hist(sanitario3_target)
plt.show()

In [None]:
# Rows with sanitario5 == 1: mean Target and its distribution.
sanitario5 = df.loc[df['sanitario5'] == 1]
sanitario5_target = sanitario5['Target']
print('toilet connected to black hole', sanitario5_target.mean())
plt.hist(sanitario5_target)
plt.show()

In [None]:
# Rows with sanitario6 == 1: mean Target and its distribution.
sanitario6 = df.loc[df['sanitario6'] == 1]
sanitario6_target = sanitario6['Target']
print('toilet connected to other system', sanitario6_target.mean())
plt.hist(sanitario6_target)
plt.show()

* energcocinar1, =1 no main source of energy used for cooking (no kitchen)
* energcocinar2, =1 main source of energy used for cooking electricity
* energcocinar3, =1 main source of energy used for cooking gas
* energcocinar4, =1 main source of energy used for cooking wood charcoal

In [None]:
# Number of rows flagged for each cooking-energy category.
cooking_categories = (
    ('no main source of energy used for cooking', 'energcocinar1'),
    ('main source of energy used for cooking electricity', 'energcocinar2'),
    ('main source of energy used for cooking gas', 'energcocinar3'),
    ('main source of energy used for cooking wood charcoal', 'energcocinar4'),
)
for cooking_label, cooking_column in cooking_categories:
    print(cooking_label, df[cooking_column].sum())

In [None]:
# Rows with energcocinar1 == 1: mean Target and its distribution.
energcocinar1 = df.loc[df['energcocinar1'] == 1]
energcocinar1_target = energcocinar1['Target']
print('no main source of energy used for cooking ', energcocinar1_target.mean())
plt.hist(energcocinar1_target)
plt.show()

In [None]:
# Rows with energcocinar2 == 1: mean Target and its distribution.
energcocinar2 = df.loc[df['energcocinar2'] == 1]
energcocinar2_target = energcocinar2['Target']
print('main source of energy used for cooking electricity ', energcocinar2_target.mean())
plt.hist(energcocinar2_target)
plt.show()

In [None]:
# Rows with energcocinar3 == 1: mean Target and its distribution.
energcocinar3 = df.loc[df['energcocinar3'] == 1]
energcocinar3_target = energcocinar3['Target']
print('main source of energy used for cooking gas', energcocinar3_target.mean())
plt.hist(energcocinar3_target)
plt.show()

In [None]:
# Rows with energcocinar4 == 1: mean Target and its distribution.
energcocinar4 = df.loc[df['energcocinar4'] == 1]
energcocinar4_target = energcocinar4['Target']
print('main source of energy used for cooking wood charcoal', energcocinar4_target.mean())
plt.hist(energcocinar4_target)
plt.show()

* epared1, =1 if walls are bad
* epared2, =1 if walls are regular
* epared3, =1 if walls are good

In [None]:
def plot_graph(columns, data=None):
    """Plot one Target histogram per indicator column, side by side.

    For every name in ``columns`` the rows where that column equals 1 are
    selected, the mean of their 'Target' is printed, and a histogram of their
    'Target' values is drawn in its own panel.

    Parameters
    ----------
    columns : sequence of str
        Indicator column names to compare. Any number of columns is accepted;
        one panel is created per column.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``df``, looked up
        when the function is called.
    """
    frame = df if data is None else data
    if len(columns) == 0:
        # Nothing to draw; a figure with zero panels cannot be created.
        return

    # squeeze=False always yields a 2-D array of axes, so a single column works
    # the same way as several. Five inches per panel reproduces the previous
    # (15, 5) figure size for the usual three-column call.
    fig, axes = plt.subplots(1, len(columns), figsize=(5 * len(columns), 5), squeeze=False)
    for axis, column in zip(axes[0], columns):
        gdf = frame[frame[column] == 1]
        print(column, " ", gdf['Target'].mean())
        axis.hist(gdf['Target'])
        axis.set_title(column)
    plt.show()

In [None]:
columns=['epared1','epared2','epared3']
plot_graph(columns)

* etecho1, =1 if roof are bad
* etecho2, =1 if roof are regular
* etecho3, =1 if roof are good

In [None]:
columns=['etecho1','etecho2','etecho3']
plot_graph(columns)

In [None]:
x_train=df.drop(['Id', 'idhogar', 'Target', 'edjefe', 'edjefa'], axis=1)
y_train=df['Target']

* lugar1, =1 region Central
* lugar2, =1 region Chorotega
* lugar3, =1 region Pacífico central
* lugar4, =1 region Brunca
* lugar5, =1 region Huetar Atlántica
* lugar6, =1 region Huetar Norte
* area1, =1 zona urbana
* area2, =2 zona rural

In [None]:
columns=['lugar1','lugar2','lugar3']
plot_graph(columns)

In [None]:
columns=['lugar4','lugar5','lugar6']
plot_graph(columns)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search the number of trees (40..60) with 5-fold cross-validation on the
# full training data.
# The forest is seeded so that the search result, and the predictions later made
# with `rfs`, are reproducible from one run to the next.
clf = RandomForestClassifier(random_state=42)
params={'n_estimators': list(range(40,61, 1))}
rfs = GridSearchCV(clf, params, cv=5)
rfs.fit(x_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_train, y_train, test_size=0.33, random_state=42)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Standardise + SVM pipeline fitted on the training split; accuracy is reported
# on the held-out split.
svm_pipeline_steps = (StandardScaler(), SVC(gamma='auto'))
clf = make_pipeline(*svm_pipeline_steps)
clf.fit(xtrain, ytrain)
ypred = clf.predict(xtest)
holdout_accuracy = accuracy_score(ytest, ypred)
print(holdout_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(ytest, ypred))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(ytest, ypred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the number of trees (40..60) with 5-fold cross-validation.
# Fit on the training split only: this model is evaluated on xtest/ytest in the
# following cells, and xtest was split out of x_train, so fitting on the full
# x_train would let the model see the evaluation rows and inflate the reports.
# The forest is seeded so the result is reproducible.
clf = RandomForestClassifier(random_state=42)
params={'n_estimators': list(range(40,61, 1))}
gs = GridSearchCV(clf, params, cv=5)
gs.fit(xtrain, ytrain)

In [None]:
ypred=gs.predict(xtest)

In [None]:
print(confusion_matrix(ytest, ypred))

In [None]:
print(classification_report(ytest, ypred))

In [None]:
test=pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv')
test.head()

In [None]:
# Apply the same cleaning steps to the test set as were applied to the training frame.
test['dependency'] = np.sqrt(test['SQBdependency'])
for test_zero_fill_column in ('v2a1', 'v18q1'):
    test[test_zero_fill_column] = test[test_zero_fill_column].fillna(0)
test.drop(columns=['SQBmeaned'], inplace=True)
test['rez_esc'] = test['rez_esc'].fillna(0)

In [None]:
xtest=test.drop(['Id','idhogar','edjefe', 'edjefa'], axis=1)
xtest['meaneduc']=xtest['meaneduc'].fillna(0)

In [None]:
xtest.shape

In [None]:
xtest.columns[xtest.isna().sum()!=0]

In [None]:
ypred=rfs.predict(xtest)

In [None]:
from pandas import DataFrame
# Start an empty frame that the following cells fill with the submission columns.
# NOTE(review): this rebinds `df`, which held the training data up to this point;
# any earlier cell that reads `df` will see this frame if re-run afterwards.
df = DataFrame ()

In [None]:
df['Id']=test['Id']
df['Target']=ypred.reshape(-1,)

In [None]:
df.head()

In [None]:
df.to_csv('submission.csv',index=False)