## CLASSIFYING PERSONAL INCOME

In [None]:
############### IMPORTING NECESSARY LIBRARIES ##############################

# To work with dataframes
import pandas as pd

# To perform numerical operations
import numpy as np

# To visualize data
import seaborn as sns

# To partition the data
from sklearn.model_selection import train_test_split

# Importing library for logistic regression
from sklearn.linear_model import LogisticRegression

# Importing performance metrics - accuracy score & confusion matrix
from sklearn.metrics import accuracy_score,confusion_matrix


In [None]:
# importing data
# NOTE(review): absolute, machine-specific Windows path - the notebook only
# runs where this exact file exists. The raw string keeps the backslashes literal.

data_income = pd.read_csv(r"C:\Users\teesh\Downloads\income(1).csv")

In [None]:
data_income.head()

In [None]:
# work on an independent (deep) copy so the frame read from disk stays untouched
data = data_income.copy(deep=True)

## EXPLORATORY DATA ANALYSIS

### 1. getting to know data
### 2. Data preprocessing (Missing Values)
### 3. Cross tables and data visualization

In [None]:
# knowing given data
# check variables data type
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

In [None]:
data.isnull()

In [None]:
# number of missing values in each column
data.isna().sum()

In [None]:
# summary of numerical variables  (descriptive statistics)
# describe() with no arguments reports count, mean, std, min, quartiles and
# max for the numeric columns only
summary_num = data.describe()
print(summary_num)

#### count = number of observations under each variable
#### mean = average age is 38 yrs
#### std = 14
#### min age = 17
#### 25% of people are younger than 28 yrs
#### 50% of people are younger than 37 yrs
#### 75% of people are younger than 48 yrs
#### max age is 90 yrs

In [None]:
# summary of categorical variables
# include='object' restricts the summary to the object-dtype (string) columns
summary_cat = data.describe(include = 'object')
print(summary_cat)

In [None]:
# frequency of each categories under one column
data["JobType"].value_counts()

In [None]:
data["occupation"].value_counts()

In [None]:
print(np.unique(data["JobType"]))

In [None]:
print(np.unique(data['occupation']))

In [None]:
# Re-read the file so that the " ?" placeholder (note the leading space) is
# parsed as NaN. This rebinds `data`, replacing the copy made earlier.
data = pd.read_csv(r"C:\Users\teesh\Downloads\income(1).csv",na_values=[" ?"])

In [None]:
data["JobType"].value_counts()

In [None]:
data["occupation"].value_counts()

In [None]:
data.isnull().sum()

In [None]:
# keep only the rows that contain at least one missing value
missing = data.loc[data.isna().any(axis="columns")]

In [None]:
display(missing)

In [None]:
missing["occupation"].value_counts()

In [None]:
missing["JobType"].value_counts()

### OUTCOMES
all entries in the "occupation" column are NaN
total entries in JobType = 1816
entries with JobType being Never-Worked = 7

entries where JobType is Never-Worked and occupation is NAN = 7
remaining entries = 1816 - 7 = 1809

### In this case we'll remove the rows with missing values

In [None]:
# drop every row that has a missing value in any column
data2 = data.dropna(axis="index", how="any")

In [None]:
data2

In [None]:
# relationship between independent variables
# Select the numeric columns first: the frame still holds string columns
# (JobType, occupation, ...), and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 instead of silently skipping it.
correlation = data2.select_dtypes(include="number").corr()

In [None]:
print(correlation)

### Outcome
None of the values is close to 1; most are close to 0, which means none of the variables are correlated with each other. Values close to 1 mean the variables are strongly positively correlated, values close to -1 mean they are strongly negatively correlated, and values close to 0 mean there is no correlation between the variables.

In [None]:
# cross tables and data visualization
data2.columns

In [None]:
# share of each gender over the whole table (normalize="all")
gender = pd.crosstab(index=data2["gender"], columns="count", normalize="all")
print(gender)

In [None]:
# gender vs salary status
gender_salstat = pd.crosstab(index = data2["gender"],columns=data2["SalStat"],normalize='index',margins=True)
print(gender_salstat)

### In classification problems we need to know how balanced the class values are. For this we use plots to examine the salary status.

In [None]:
# frequency distribution of Salary status
# Name the column by keyword: from seaborn 0.12 on, `data` is the only
# positional parameter, so a bare Series is no longer interpreted as `x`.
salstat = sns.countplot(x="SalStat", data=data2)
salstat

## 75% of salaries are less than or equal to 50,000 and 25% of salaries are greater than 50,000

In [None]:
# Histogram for age
# kde=False -> plain 10-bin histogram without a density curve
# NOTE(review): distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot - migrate once the installed seaborn version allows it.
sns.distplot(data2["age"],kde=False,bins=10)

### the number of people aged 20-45 is high

In [None]:
# age vs salstat
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises a TypeError there.
sns.boxplot(x="SalStat", y="age", data=data2)
# median age within each salary class
data2.groupby("SalStat")["age"].median()

## people aged 35-50 are more likely to earn > 50,000 and people aged 25-45 are more likely to earn <= 50,000

In [None]:
# exploratory data analysis
# jobType vs SalStat
# y= puts the JobType categories on the vertical axis; hue= splits each bar
# by salary status
sns.countplot(y="JobType" , data=data2 , hue="SalStat")

In [None]:
pd.crosstab(index = data2["JobType"] , columns=data2["SalStat"],normalize='index')

## JobType is an important variable in avoiding the misuse of subsidies

In [None]:
sns.countplot(y='EdType',data=data2,hue="SalStat")

In [None]:
pd.crosstab(index = data2["EdType"],columns=data2["SalStat"],normalize='index')

### those who have a Doctorate, Masters or Prof-School education are more likely to earn > 50,000; thus, EdType is an important variable

In [None]:
sns.countplot(y='occupation',data=data2,hue='SalStat')

In [None]:
pd.crosstab(index=data2['occupation'],columns=data2['SalStat'],normalize='index')

### people in Exec-managerial and Prof-specialty occupations are more likely to make more than 50,000 per year; thus, occupation is an important variable to avoid misuse.

In [None]:
# histogram of capital gain with explicit bin edges (steps of 20,000), no density curve
# NOTE(review): distplot is deprecated since seaborn 0.11 in favour of histplot/displot.
sns.distplot(data2["capitalgain"],bins=[0,20000,40000,60000,80000],kde=False)

In [None]:
# histogram of capital loss with explicit bin edges (steps of 1,000), no density curve
# NOTE(review): distplot is deprecated since seaborn 0.11 in favour of histplot/displot.
sns.distplot(data2["capitalloss"],bins=[0,1000,2000,3000,4000],kde=False)

In [None]:
sns.boxplot(y="hoursperweek",x="SalStat",data=data2)

## Logistic Regression

In [None]:
data3 = data2.copy()

In [None]:
data4 = data2.copy()

In [None]:
# reindexing the salary status names to 0,1
# The dict keys must match the raw labels exactly, including the leading
# space; Series.map turns any label that is not a key into NaN.
data3["SalStat"] = data3["SalStat"].map({' less than or equal to 50,000': 0,' greater than 50,000': 1})

In [None]:
data3

## integer encoding = categorical into numerical

In [None]:
# One-hot (dummy) encoding: every categorical column is split into one
# indicator column per category present in it. drop_first=True leaves out
# the first category of each column.
new_data = pd.get_dummies(data3, drop_first=True)

In [None]:
print(np.unique(data3["JobType"]))

In [None]:
new_data

In [None]:
# column names of the encoded frame, as a plain Python list
columns_list = new_data.columns.tolist()
print(columns_list)

In [None]:
# separating the input names from data
# Filter in column order instead of taking a set difference: the iteration
# order of a set of strings changes between interpreter runs, which shuffled
# the feature columns and made the fitted coefficients (and any saved model)
# impossible to line up with their column names reproducibly.
features = [name for name in columns_list if name != "SalStat"]
print(features)

In [None]:
# storing the output values in y
# .values returns the column as a NumPy array (the 0/1 salary labels)
y = new_data["SalStat"].values
print(y)
# number of labels
y.shape

In [None]:
# storing the values from input features
# selects the feature columns and converts them to a 2-D NumPy array
x = new_data[features].values
print(x)
# (rows, number of feature columns)
x.shape

In [None]:
# splitting the data into train and test
# x = inputs, y = output
# test_size    -> proportion of the data held out for the test split (30 %)
# random_state -> fixed seed so the same rows are chosen on every run
(train_x, test_x,
 train_y, test_y) = train_test_split(x, y, random_state=0, test_size=0.3)

In [None]:
# shapes of the four splits, printed in the order test_x, test_y, train_x, train_y
for part in (test_x, test_y, train_x, train_y):
    print(part.shape)

In [None]:
train_y

In [None]:
# make an instance of model
# (default hyperparameters; the following cell rebinds `logistic` to an
# instance with an explicit solver and max_iter before anything is fitted)
logistic = LogisticRegression()

In [None]:
# Rebuild the model with the lbfgs solver and a very high iteration cap,
# presumably so the optimiser can converge on the unscaled features - TODO confirm.
from sklearn import linear_model
logistic = linear_model.LogisticRegression(solver='lbfgs',max_iter=100000)

In [None]:
# fitting values for x and y
logistic.fit(train_x,train_y)

In [None]:
logistic.coef_

In [None]:
logistic.intercept_

In [None]:
# prediction from test data
prediction = logistic.predict(test_x)

In [None]:
print(prediction)

### Model evaluation

In [None]:
# confusion matrix table use to evaluate the performance of classification problem 
# give number of correct prediction and number of incorrect prediction 
## diagonal sum represent number of correct predictions 
## opposite diagonal sum give number of incorrect predictions
confusion_matrix = confusion_matrix(test_y,prediction)

In [None]:
print(confusion_matrix)

In [None]:
# calculating accuracy 
accuracy_score1 = accuracy_score(test_y,prediction)

In [None]:
print(accuracy_score1)

In [None]:
# number of test rows whose predicted class differs from the true class
print("Missclassified samples: %d" % np.count_nonzero(test_y != prediction))

In [None]:
# logistic Regression - removing insignificant variables
data4["SalStat"] = data4["SalStat"].map({' less than or equal to 50,000': 0,' greater than 50,000':1})

In [None]:
data4

In [None]:
# columns treated as insignificant are removed before encoding
cols = ['gender','nativecountry','race','JobType']
new_data2 = data4.drop(columns=cols)

In [None]:
new_data2 = pd.get_dummies(new_data2,drop_first=True)

In [None]:
# column names of the reduced, encoded frame as a plain Python list
column_list = new_data2.columns.tolist()
print(column_list)

In [None]:
# Input column names, kept in DataFrame column order. The previous set
# difference produced a different order on each interpreter run (string
# hashing is randomised), so the feature matrix columns were not reproducible.
features2 = [name for name in column_list if name != "SalStat"]

In [None]:
y=new_data2["SalStat"].values

In [None]:
x=new_data2[features2].values

In [None]:
# 70/30 split with a fixed seed, on the reduced feature set
(train_x1, test_x1,
 train_y1, test_y1) = train_test_split(x, y, random_state=0, test_size=0.3)

In [None]:
# Default-parameter model. It is fitted in the next cell, but `logistic1` is
# then rebound to a max_iter=100000 instance and refitted, so this first fit
# is discarded.
logistic1 = LogisticRegression()

In [None]:
logistic1.fit(train_x1,train_y1)

In [None]:
# Replace the default model with one using the lbfgs solver and a very high
# iteration cap (same settings as the first logistic model).
from sklearn import linear_model
logistic1 = linear_model.LogisticRegression(solver='lbfgs',max_iter=100000)

In [None]:
logistic1.fit(train_x1,train_y1)

In [None]:
prediction1 = logistic1.predict(test_x1)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
accuracy_score2 = accuracy_score(test_y1,prediction1)

In [None]:
print(accuracy_score2)

In [None]:
confusion_matrix = confusion_matrix(test_y1,prediction1)

In [None]:
print(confusion_matrix)

In [None]:
# count of test rows the reduced-feature model classifies wrongly
print("Missclassified values %d" % np.count_nonzero(test_y1 != prediction1))

### KNN classifier model to classify records into one of the salary status categories

In [None]:
# importing the library of KNN
from sklearn.neighbors import KNeighborsClassifier

# import library for plotting
import matplotlib.pyplot as plt

In [None]:
# storing the K nearest neighbors classifier
# n_neighbors=5: a new record is classified as <=50,000 or >50,000 by
# majority vote among its 5 nearest neighbours in the training data
KNN_classifier = KNeighborsClassifier(n_neighbors = 5)

In [None]:
KNN_classifier.fit(train_x,train_y)

In [None]:
prediction = KNN_classifier.predict(test_x)

In [None]:
prediction

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix 

In [None]:
confusion_matrix = confusion_matrix(test_y,prediction)

In [None]:
confusion_matrix

In [None]:
accuracy_score = accuracy_score(test_y,prediction)

In [None]:
accuracy_score

In [None]:
# count of test rows the KNN classifier gets wrong
print("Misclassified samples %d" % np.count_nonzero(test_y != prediction))

## Effect of K value on classifier

In [None]:
Misclassified_samples = []

In [None]:
# calculating error for K = 1 .. 19 (range(1, 20) stops before 20)
for i in range(1,20):
    # fresh classifier for this K, trained on the full-feature training split
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(train_x,train_y)
    pred_i = knn.predict(test_x)
    # number of test rows this K gets wrong
    Misclassified_samples.append((test_y != pred_i).sum())

In [None]:
print(Misclassified_samples)

# Deployment

In [None]:
import warnings
import pickle

In [None]:
# Persist the fitted logistic-regression model, then read it back.
# The `with` blocks close each file handle as soon as the block ends; the
# bare open() calls never closed their handles explicitly, leaving flushing
# and closing to the garbage collector.
with open('classify_model.pkl', 'wb') as model_file:
    pickle.dump(logistic, model_file)

# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from a tampered file.
with open('classify_model.pkl', 'rb') as model_file:
    modelclassify = pickle.load(model_file)