## linear regression

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings

warnings.filterwarnings("ignore")

### data preprocess

In [3]:
# load data & show head
df = pd.read_csv('../cs5228/train.csv')
print(df.shape)
df.head()

(24421, 14)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,exceeds50K
0,30,?,147215,Some-college,10,Never-married,?,Own-child,Female,0,0,30,United-States,0
1,60,Private,173960,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Female,0,0,42,United-States,0
2,52,?,105428,Some-college,10,Married-civ-spouse,?,Husband,Male,0,0,12,United-States,0
3,37,Private,112497,Bachelors,13,Married-civ-spouse,Sales,Husband,Male,0,0,60,United-States,0
4,63,Private,137843,Some-college,10,Married-civ-spouse,Sales,Husband,Male,7298,0,48,United-States,1


In [4]:
# replace ' ?' by mode
for feature in df.columns:
    print(feature, ' ', len(df[df[feature] == ' ?']))
df[df ==' ?'] = np.nan
for feature in ['workclass', 'occupation', 'native-country']:
    df[feature].fillna(df[feature].mode()[0], inplace=True)
print(df.shape)

age   0
workclass   1392
fnlwgt   0
education   0
education-num   0
marital-status   0
occupation   1399
relationship   0
sex   0
capital-gain   0
capital-loss   0
hours-per-week   0
native-country   410
exceeds50K   0
(24421, 14)


In [5]:
# split features and labels
X = df.drop(['exceeds50K'],axis=1)
Y = df['exceeds50K']
X.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,30,Private,147215,Some-college,10,Never-married,Prof-specialty,Own-child,Female,0,0,30,United-States
1,60,Private,173960,Bachelors,13,Divorced,Prof-specialty,Not-in-family,Female,0,0,42,United-States
2,52,Private,105428,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,Male,0,0,12,United-States


### read test.csv, preprocess

In [6]:
df_test = pd.read_csv('../cs5228/test.csv')
print(df_test.shape)
df_test[df_test==' ?'] = np.nan
print(df_test.isnull().sum())
df_test.head()

(24421, 13)
age                  0
workclass         1407
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1410
relationship         0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     447
dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,23,Private,32732,Some-college,10,Married-civ-spouse,Sales,Husband,Male,0,0,25,United-States
1,69,Private,165017,HS-grad,9,Widowed,Machine-op-inspct,Unmarried,Male,2538,0,40,United-States
2,27,Private,36440,Bachelors,13,Never-married,Sales,Not-in-family,Female,0,0,40,United-States
3,40,Private,182217,Some-college,10,Married-civ-spouse,Other-service,Wife,Female,0,0,40,Scotland
4,24,Private,89347,11th,7,Never-married,Machine-op-inspct,Not-in-family,Female,0,0,40,United-States


In [7]:
# replace ' ?' by mode
for feature in ['workclass', 'occupation', 'native-country']:
    df_test[feature].fillna(df_test[feature].mode()[0], inplace=True)
print(df_test.shape)

(24421, 13)


## embedding, train&test data together

In [9]:
X_train ,X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3, random_state = 0)
X_test = df_test

# total feature count 13
feature_list = X.columns

In [10]:
# transfer string feature into int

# features in string to be encoded
encoding_features = ['workclass','education','marital-status','occupation','relationship','sex','native-country']

# to include all possible values
X_all = pd.concat([X, X_test], axis =0)
for feature in encoding_features:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(X_all[feature])
    X_train[feature] = encoder.transform(X_train[feature])
    X_val[feature] = encoder.transform(X_val[feature])
    X_test[feature] = encoder.transform(X_test[feature])

X_train.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
13797,47,5,168109,11,9,2,4,0,1,15024,0,50,38
13680,39,3,81487,15,10,0,3,4,0,0,625,40,38
17228,41,4,277858,9,13,6,3,1,0,0,0,45,38


In [11]:
# feature scaling

scaler = preprocessing.StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = feature_list)
X_val = pd.DataFrame(scaler.transform(X_val), columns = feature_list)
X_test = pd.DataFrame(scaler.transform(X_test), columns = feature_list)
X_train.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0.604047,1.690949,-0.199511,0.179212,-0.424562,-0.420143,-0.551163,-0.899892,0.701437,1.853092,-0.214843,0.773752,0.261615
1,0.02,-0.090841,-1.013496,1.212852,-0.035004,-1.749512,-0.802633,1.589931,-1.425645,-0.145981,1.364409,-0.037755,0.261615
2,0.166012,0.800054,0.831799,-0.337608,1.133669,2.238596,-0.802633,-0.277436,-1.425645,-0.145981,-0.214843,0.367998,0.261615


## logistic regression training

In [13]:
clf = LogisticRegression()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_val)
print('accuracy', accuracy_score(Y_val,Y_pred))
print(classification_report(Y_val,Y_pred))

accuracy 0.8322642281970793
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      5573
           1       0.73      0.47      0.57      1754

    accuracy                           0.83      7327
   macro avg       0.79      0.71      0.73      7327
weighted avg       0.82      0.83      0.82      7327



## predict test data

In [12]:
Y_test = clf.predict(X_test)
print(Y_test.shape)

(24421,)


In [13]:
# wirte into file
df_res = pd.read_csv('../cs5228/sample_submission.csv')
df_res['prediction'] = Y_test
print(df_res.head())
df_res.to_csv('../submissions/lr_submission.csv', index = False)

   id  prediction
0   1           0
1   2           0
2   3           0
3   4           0
4   5           0
