# Census Income Data Set

## Import and explore the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw CSV, treating the literal "?" marker as a missing value, and
# keep only the rows in which every column is populated.
df = pd.read_csv("other.csv", na_values='?').dropna(how='any')
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string class labels in "Output" with integer codes. The fitted
# encoder is kept so that `labelencoder.classes_` can map codes back to labels.
labelencoder = LabelEncoder()
df["Output"] = labelencoder.fit_transform(df['Output'])
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

## Visualizing Data

In [None]:
# Heat-map of the pairwise correlations between the numeric columns.
f = plt.figure(figsize=(12, 10))
# The frame still contains string-valued categorical columns at this point;
# recent pandas versions raise on them instead of silently skipping them, so
# restrict the correlation to numeric columns explicitly.
corr = df.select_dtypes(include="number").corr()
plt.matshow(corr, fignum=f.number)
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, fontsize=10, rotation=45)
plt.yticks(tick_positions, corr.columns, fontsize=10)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=10)
plt.title('Correlation Matrix', fontsize=16)


In [None]:
# One panel per feature, stacked vertically: histograms for the continuous
# columns and count plots for the categorical ones. The list order is the
# top-to-bottom order of the panels.
panels = [
    (sns.histplot, 'Age'),
    (sns.countplot, 'Work Class'),
    (sns.countplot, 'Education'),
    (sns.countplot, 'Education Num'),
    (sns.countplot, 'Married'),
    (sns.countplot, 'Occupation'),
    (sns.countplot, 'Relationship'),
    (sns.countplot, 'Race'),
    (sns.countplot, 'Sex'),
    (sns.histplot, 'CapitalGain'),
    (sns.histplot, 'CapitalLoss'),
    (sns.histplot, 'HoursPerWeek'),
    (sns.countplot, 'Native'),
]

figure, axis = plt.subplots(len(panels), 1, figsize=(24, 48))
figure.tight_layout(pad=10, w_pad=10, h_pad=10)
for panel_ax, (draw, column) in zip(axis, panels):
    draw(x=column, data=df, ax=panel_ax)

# NOTE(review): this is applied after the panels are drawn, so it presumably
# only influences figures created later — confirm that is the intent.
plt.rcParams.update({'font.size': 12})

## Data Modification

In [None]:
# Reduce the capital columns to 0/1 flags ("any gain/loss reported") using
# vectorized comparisons instead of a Python-level lambda per row.
for capital_col in ("CapitalGain", "CapitalLoss"):
    df[capital_col] = (df[capital_col] != 0).astype("int64")

# Collapse the country of origin to "US" vs "Other". The already-recoded value
# "US" is accepted as well, so re-running this cell does not turn every row
# into "Other".
df["Native"] = np.where(df["Native"].isin(["United-States", "US"]), "US", "Other")
df.head()

In [None]:
# Build the feature matrix: remove the target ("Output") and the "Fnlwgt"
# column, then one-hot encode every categorical column.
categorical_columns = [
    "Sex",
    "Race",
    "Relationship",
    "Occupation",
    "Married",
    "Education",
    "Work Class",
    "Native",
]
X = pd.get_dummies(df.drop(columns=["Output", "Fnlwgt"]), columns=categorical_columns)
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column; from here on X holds the transformed
# values returned by the scaler rather than the original DataFrame.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features — consider fitting
# on the training portion only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Target vector: the "Output" column of the cleaned frame.
y = df.loc[:, "Output"]
y.head()

## Testing Models

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. A fixed seed makes the split (and
# therefore every metric reported afterwards) reproducible between runs, and
# stratifying on y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Random-forest baseline. The forest is seeded so that repeated runs on the
# same split report the same accuracy and confusion matrix.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
pred = forest.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)
print(confusion_matrix(y_test, pred))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with default hyper-parameters, evaluated on the
# held-out split. accuracy_score and confusion_matrix are the names imported
# in the random-forest cell.
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)
print(confusion_matrix(y_test, pred))