In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Load Data
# Read the CSV from its hard-coded path into `data` and preview the first ten rows.
data = pd.read_csv(filepath_or_buffer='/content/adult - adult.csv')
data.head(n=10)

In [None]:
# Turn the literal '?' placeholder into NaN so pandas treats it as missing,
# then preview the first ten rows again.
data = data.replace(to_replace='?', value=np.nan)
data.head(n=10)

In [None]:
# Per-column count of missing values. It is printed explicitly because a
# notebook cell only renders its last expression; as a bare statement the
# summary would be computed and silently thrown away.
print(data.isnull().sum())
# (rows, columns) before any rows are removed.
data.shape

In [None]:
# Remove, in place, every row that has at least one missing value,
# then show the remaining (rows, columns).
data.dropna(axis=0, how='any', inplace=True)
data.shape

In [None]:
# Binarise the target label: '<=50K' becomes 0 and '>50K' becomes 1.
data['income'] = (
    data['income']
    .map({'<=50K': 0, '>50K': 1})
    .astype(int)
)
# Remove the 'education' column from the frame (in place).
data.drop(columns=['education'], inplace=True)

In [None]:
#Plotting
# Horizontal count plot of workclass, with each bar group split by the income label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Income of people according to their workclass", fontsize=16)
sns.countplot(data=data, y='workclass', hue='income', ax=ax)
plt.show()

In [None]:
# Relative frequency of each occupation: group size divided by the total row count.
occupation_enc = data.groupby('occupation').size().div(len(data))
print(occupation_enc)

In [None]:
# Swap each occupation label for its relative frequency by looking it up in
# the frequency table built in the previous cell.
data['occupation_enc'] = data['occupation'].map(occupation_enc)
data['occupation_enc'].head(n=3)  # not rendered: only a cell's last expression is displayed
# Relative frequency of each workclass value: group size over total row count.
workclass_enc = data.groupby('workclass').size().div(len(data))
print(workclass_enc)

In [None]:
# Swap each workclass label for its relative frequency, then preview three rows.
data['workclass_enc'] = data['workclass'].map(workclass_enc)
data['workclass_enc'].head(n=3)

In [None]:
# Collapse native-country to two labels. Each write goes through one
# `data.loc[mask, column]` call rather than the chained
# `data[column].loc[mask] = ...` form: the chained form assigns into an
# intermediate Series and is not guaranteed to update `data` (it never does
# under pandas Copy-on-Write), which would leave country_enc all-NaN below.
data.loc[data['native-country'] == 'United-States', 'native-country'] = 'usa'
data.loc[data['native-country'] != 'usa', 'native-country'] = 'non_usa'
# Printed explicitly: a bare expression in the middle of a cell is not rendered.
print(data['native-country'].value_counts())
# Binary flag: 1 for 'usa', 0 for 'non_usa'; the text column is then dropped.
data['country_enc'] = data['native-country'].map({'usa': 1, 'non_usa': 0})
data.drop(['native-country'], axis=1, inplace=True)
# Binary-encode gender ('Male' -> 1, 'Female' -> 0) and drop the text column.
data['sex_enc'] = data['gender'].map({'Male': 1, 'Female': 0})
data.drop(['gender'], axis=1, inplace=True)
# Frequency-encode marital-status: each label becomes its share of all rows.
marital_status_enc = (data.groupby('marital-status').size()) / len(data)
print(marital_status_enc)
data['marital_status_enc'] = data['marital-status'].map(marital_status_enc)
data.drop(['marital-status'], axis=1, inplace=True)
# Frequency-encode race the same way; the 'race' column itself is dropped in a later cell.
race_enc = (data.groupby('race').size()) / len(data)
print(race_enc, '\n')
data['race_enc'] = data['race'].map(race_enc)

In [None]:
# Frequency-encode relationship: each label becomes its share of all rows.
relationship_enc = data.groupby('relationship').size().div(len(data))
print(relationship_enc)
data['relationship_enc'] = data['relationship'].map(relationship_enc)
# These four text columns all have *_enc counterparts by now; drop them in one call.
data.drop(columns=['race', 'relationship', 'workclass', 'occupation'], inplace=True)
# Build the modelling frame with 'income' moved to the last column.
new_ds = data.drop(columns=['income']).assign(income=data['income'])
new_ds

In [None]:
# Reset matplotlib to its defaults, then switch to the ggplot style sheet.
plt.style.use('default')
plt.style.use('ggplot')
clist = ['fnlwgt','age','capital-gain','capital-loss','hours-per-week']
plt.figure(figsize = (12,6))
# One box plot per feature on a 2x3 grid. The series is passed by keyword
# (`x=`) because a lone positional argument is interpreted differently across
# seaborn releases, which made the plot orientation version-dependent.
for position, column in enumerate(clist, start=1):
    ax = plt.subplot(2, 3, position)
    sns.boxplot(x=data[column], color='skyblue', ax=ax)
print("BoxPlots of the features:")
plt.show()
from scipy.stats import zscore
# Absolute z-scores for the label slice 'fnlwgt'..'hours-per-week'.
# NOTE(review): which columns this slice covers depends on the column order of
# the loaded CSV and may not match `clist` (e.g. 'age') - confirm.
zabs = np.abs(zscore(new_ds.loc[:,'fnlwgt':'hours-per-week']))
# Shape of the (row, column) index arrays of all cells with |z| >= 3.
print(np.shape(np.where(zabs >= 3)))
# Keep only rows whose z-scores are all below 3. `.copy()` makes the result an
# independent frame, so later column writes to new_ds are unambiguous.
new_ds = new_ds[(zabs < 3).all(axis = 1)].copy()
new_ds

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of new_ds.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("Correlation between target and features:")
sns.heatmap(new_ds.corr(), annot=True, ax=ax)
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
# Work on an independent copy so the column write below targets this frame itself.
new_ds = new_ds.copy()
# Columns in the label slice 'age'..'hours-per-week' are rescaled to [0, 1].
scaled_cols = new_ds.loc[:, 'age':'hours-per-week'].columns
# Replace the columns wholesale instead of writing through `.loc[:, ...]`:
# the scaler returns floats, and setting floats into existing (presumably
# integer-typed) columns via `.loc` is an incompatible-dtype assignment that
# pandas has deprecated and will reject.
# NOTE(review): the scaler is fitted on every row before the later train/test
# split, so held-out rows influence the min/max used for scaling.
new_ds[scaled_cols] = scale.fit_transform(new_ds[scaled_cols])
new_ds

In [None]:
# Bar chart of how many rows fall into each income class.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Values distribution in target class: Income")
sns.countplot(x='income', data=new_ds, ax=ax)
plt.show()

In [None]:
from imblearn.combine import SMOTETomek
# Features: the label slice 'age'..'relationship_enc'; target: the 'income' column.
x = new_ds.loc[:,"age":"relationship_enc"]
y = new_ds.loc[:,"income"]
# Seed the resampler so the synthetic samples (and everything derived from
# them) are reproducible on re-run; 0 matches the random_state used by the
# split and the classifier elsewhere in this notebook.
smk = SMOTETomek(random_state = 0)
x_new, y_new = smk.fit_resample(x, y)

In [None]:
# Bar chart of the class counts in the resampled target y_new.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Values in target class after using SMOTETomek")
sns.countplot(x=y_new, ax=ax)
plt.show()

In [None]:
# Preview the first five rows of the modelling frame.
new_ds.head(n=5)

In [None]:
# Feature matrix: every column except the last. Target vector: the last column.
X = new_ds.iloc[:, :-1].to_numpy()
y = new_ds.iloc[:, -1].to_numpy()

In [None]:
#Split Train Test
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
# Hold out 25% of the rows for evaluation; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Balance the classes of the training partition only. The notebook resamples
# the full data set earlier but never feeds that result to the model, so the
# classifier would otherwise be fitted on the imbalanced data. Resampling
# after the split keeps synthetic samples out of the held-out rows.
# NOTE(review): this changes the fitted model and the reported metrics.
X_train, y_train = SMOTETomek(random_state = 0).fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
# Seeded logistic-regression model fitted on the training partition;
# `cf` holds whatever `fit` returns.
classifier = LogisticRegression(random_state=0)
cf = classifier.fit(X=X_train, y=y_train)
# Display the fitted intercept term.
cf.intercept_

In [None]:
# Predict labels for the held-out rows.
y_pred = classifier.predict(X_test)
# Print predictions and true labels side by side: column 0 predicted, column 1 actual.
print(np.column_stack((y_pred, y_test)))

In [None]:
# Display the fitted per-feature coefficients (`cf` is the object returned by
# `classifier.fit`, i.e. the estimator itself in scikit-learn).
classifier.coef_

In [None]:
# Per-class probabilities for every row of X. They are printed explicitly
# because a notebook cell only renders its last expression; as a bare
# statement the result would be computed and discarded.
print(cf.predict_proba(X))
# Mean accuracy on X.
# NOTE(review): X contains both the rows the model was trained on and the
# held-out rows, so this figure is not a pure test-set accuracy.
cf.score(X,y)

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true vs. predicted labels on the held-out rows.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of held-out rows that were classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)