In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [15]:
df = pd.read_csv('salarydata.csv')

In [16]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [17]:
def object_numeric_bifurcation(df):
    object = [feature for feature in df.columns if df[feature].dtypes == 'O' and feature not in 'income']
    
    numeric = [feature for feature in df.columns if df[feature].dtypes != 'O']
    
    return object, numeric

In [18]:
object, numeric = object_numeric_bifurcation(df)

In [19]:
def feature_engineering(df):
    feature = df['native-country'].value_counts().sort_values(ascending = False).tail(20).index
    
    df['native-country'] = np.where(df['native-country'].isin([' Japan', ' Laos', ' Columbia', ' Iran', ' Thailand', ' Yugoslavia',
       ' Peru', ' Portugal', ' Greece', ' Nicaragua', ' Trinadad&Tobago',
       ' Ireland', ' Cambodia', ' Ecuador', ' Outlying-US(Guam-USVI-etc)',
       ' Scotland', ' Hungary', ' Hong', ' Holand-Netherlands', ' France']), 'All', df['native-country'])
    df['education'] = np.where(df['education'].isin([' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th']), 'School', df['education'])
    
    df.drop('fnlwgt', axis = 1, inplace = True)
    
    df['capital-gain'] = np.where(df['capital-gain'] < 0, 0, df['capital-gain'])
    df['capital-loss'] = np.where(df['capital-loss'] < 0, 0, df['capital-loss'])
    
    df['income'] = np.where(df['income'] == ' <=50K', 1, 0)
    
    return df

In [20]:
df = feature_engineering(df)

In [21]:
def ohe(df):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import OneHotEncoder
    for feature in object:
        df[feature] = LabelEncoder().fit_transform(df[feature])
    
    final_encoded = []
    for feature in object:
        encoded_data = OneHotEncoder().fit_transform(df[feature].values.reshape(-1,1)).toarray()
        n = df[feature].nunique()
        cols = ['{}_{}'.format(feature, n) for n in range(1, n+1)]
        encoded_df = pd.DataFrame(encoded_data, columns = cols)
        encoded_df.index = df.index
        final_encoded.append(encoded_df)
    final_data = pd.concat([df, *final_encoded], axis = 1)
    
    return final_data

In [22]:
final_data = ohe(df)

In [25]:
def data_segregation(df):
    df.drop(object, axis = 1, inplace = True)
    
    x = df.drop('income', axis = 1)
    y = df.iloc[:, -1]
    
    return x, y

In [26]:
x, y = data_segregation(final_data)

In [29]:
def selection_of_variables(x, y, final_data):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    
    best_features = SelectKBest(score_func = chi2, k = 'all')
    
    fit = best_features.fit(x, y)
    
    df_scores = pd.DataFrame(fit.scores_)
    dfcol = pd.DataFrame(x.columns)
    
    feature_score = pd.concat([df_scores, dfcol], axis = 1)
    feature_score.columns = ['score', 'variable']
    
    final = feature_score.groupby('variable').sum().sort_values(by = 'score', ascending = False).head(40).index
    
    new_data = final_data[final]
    
    return new_data, final
    

In [30]:
new_data, final = selection_of_variables(x, y, final_data)

In [33]:
def scaling(new_data):
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    new_data = sc.fit_transform(new_data)
    
    return new_data

In [34]:
new_data = scaling(new_data)

In [35]:
def data_balance(new_data, y):
    from imblearn.over_sampling import RandomOverSampler
    os = RandomOverSampler(0.4)
    xover, yover = os.fit_resample(new_data, y)
    
    return xover, yover

In [36]:
xover, yover = data_balance(new_data, y)



In [39]:
def over_sampling_values(y, yover):
    
    from collections import Counter
    print('Before sampling:{}'.format(Counter(y)))
    print('After sampling:{}'.format(Counter(yover)))

In [40]:
over_sampling_values(y, yover)

Before sampling:Counter({0.0: 18739, 1.0: 205})
After sampling:Counter({0.0: 18739, 1.0: 7495})


In [51]:
def classification_model(xover, yover):
    
    from sklearn.linear_model import LogisticRegression
    
    classifier = LogisticRegression()
    
    from sklearn.model_selection import train_test_split
    xtrain, xtest, ytrain, ytest = train_test_split(xover, yover,stratify = yover, test_size = 0.2, random_state = 0)
    
    classifier.fit(xtrain, ytrain)
    
    ypred = classifier.predict(xtest)
    
    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    
    print("Confusion Matrix{}".format(confusion_matrix(ytest, ypred)))
    print("Classification Report {}".format(classification_report(ytest, ypred)))
    print("Accuracy Score {}".format(accuracy_score(ytest, ypred)))

In [53]:
classification_model(xover, yover)

Confusion Matrix[[3748    0]
 [   0 1499]]
Classification Report               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3748
         1.0       1.00      1.00      1.00      1499

    accuracy                           1.00      5247
   macro avg       1.00      1.00      1.00      5247
weighted avg       1.00      1.00      1.00      5247

Accuracy Score 1.0
