In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_input(data_file,test=False):
    '''
    This reads inputs files and assigns column names since the data has no column names.
    Skipping initial space since the data has space which makes difficult to get values by names.
    Column names: ["age", "workclass", "fnlwgt", "education", "education_num", 
           "marital_status", "occupation", "relationship", "race", "gender", 
           "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"]
        
    Args:
    * data_file: Input file path
    * test=False; This takes care of skipping initial row in case of test data. The test data is like that.
    
    Return:
    Dataframe with proper column names
    '''
    columns = ["age", "workclass", "fnlwgt", "education", "education_num", 
           "marital_status", "occupation", "relationship", "race", "gender", 
           "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"]
    if test:
        df = pd.read_csv(data_file,names=columns,skipinitialspace=True,skiprows=1)
    else:
        df = pd.read_csv(data_file,names=columns,skipinitialspace=True)
    return df

In [26]:
def primary(x):
    '''
    Reduces categories ['1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'] and makes it "Primary"
        
    Args:
    * x: String
    
    Return:
    "Primary"/ x
    '''
    if x in ['1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th']:
        return ' Primary'
    else:
        return x

In [4]:
def native(country):
    '''
    Grouping countries based on region
    Args:
    * x: Country Name
    
    Return:
    Region for country

    '''
    if country in ['United-States',  'Unknown','Canada','Columbia']:
        return 'North America'
    elif country in ['Cuba','Mexico','Dominican-Republic','Puerto-Rico','Outlying-US(Guam-USVI-etc)','Jamaica','Haiti'
                     ,'Honduras','El-Salvador','Guatemala','Nicaragua']:
        return 'Central America'
    elif country in ['South','Ecuador','Peru','Trinadad&Tobago']:
        return 'South America'
    elif country in ['England', 'Germany','Italy','Portugal','France', 'Yugoslavia','Scotland','Greece', 
                     'Ireland',' Philippines','Hungary','Holand-Netherlands','Ireland','Poland']:
        return 'European'
    elif country in ['India', 'Iran', 'Philippines','Cambodia','Thailand', 'Laos', 'Taiwan', 'Japan', 'China', 'Vietnam',
                     'Hong']:
        return 'Eastern'    
    else: 
        return country 

In [27]:
def normalize(data):
    '''
    data: dataframe which you want to normalize
    returns: normalized dataframe and normalization params
    '''
    fmean=np.mean(data)
    frange=np.amax(data)-np.amin(data)
    normalization_params = [fmean,frange]
    data-=fmean
    data/=frange
    return data,normalization_params

In [28]:
def preprocess_train(df):
    '''
    This is a helper function which is developed using the analysis done.
    '''
    df.replace('?',np.nan,inplace=True)
    df['workclass'].fillna('Not working now', inplace=True)
    df['occupation'].fillna('Unknown',inplace=True)
    df['native_country'].fillna('Unknown',inplace=True)
    #df['income_bracket'] = df['income_bracket'].apply(lambda x: 1 if x=='>50K' else 0)
    df['education'] = df['education'].apply(primary)
    df['fnlwgt'] = df['fnlwgt'].apply(lambda x: np.log1p(x))
    df['native_country']=df['native_country'].apply(lambda country: native(country))
    df=pd.get_dummies(df)
    df,normalization_params=normalize(df)
    return df,normalization_params

In [7]:
def preprocess_test(df,normalization_params):
    '''
    This is a helper function which is developed using the analysis done.
    '''
    df.replace('?',np.nan,inplace=True)
    df['workclass'].fillna('Not working now', inplace=True)
    df['occupation'].fillna('Unknown',inplace=True)
    df['native_country'].fillna('Unknown',inplace=True)
    #df['income_bracket'] = df['income_bracket'].apply(lambda x: 1 if x=='>50K' else 0)
    df['education'] = df['education'].apply(primary)
    df['fnlwgt'] = df['fnlwgt'].apply(lambda x: np.log1p(x))
    df['native_country']=df['native_country'].apply(lambda country: native(country))
    df=pd.get_dummies(df)
    #normalizing
    fmean,frange=normalization_params
    df-=fmean
    df/=frange
    return df

In [8]:
def weightInitialization(n_features):
    w = np.zeros((1,n_features))
    b = 0
    return w,b

def sigmoid_activation(result):
    final_result = 1/(1+np.exp(-result))
    return final_result


In [9]:
def model_optimize(w, b, X, Y):
    m = X.shape[0]
    
    #Prediction
    final_result = sigmoid_activation(np.dot(w,X.T)+b)
    Y_T = Y.T
    cost = (-1/m)*(np.sum((Y_T*np.log(final_result)) + ((1-Y_T)*(np.log(1-final_result)))))
    #
    
    #Gradient calculation
    dw = (1/m)*(np.dot(X.T, (final_result-Y.T).T))
    db = (1/m)*(np.sum(final_result-Y.T))
    
    grads = {"dw": dw, "db": db}
    
    return grads, cost

In [10]:
def model_train(w, b, X, Y, learning_rate, no_iterations):
    costs = []
    for i in range(no_iterations):
        #
        grads, cost = model_optimize(w,b,X,Y)
        #
        dw = grads["dw"]
        db = grads["db"]
        #weight update
        w = w - (learning_rate * (dw.T))
        b = b - (learning_rate * db)
        #
        
        if (i % 100 == 0):
            costs.append(cost)
            print("Cost after %i iteration is %f" %(i, cost))
    
    #final parameters
    coeff = {"w": w, "b": b}
    gradient = {"dw": dw, "db": db}
    
    return coeff, gradient, costs

In [11]:
def convertToClasses(final_pred, m):
    y_pred = np.zeros((1,m))
    for i in range(final_pred.shape[1]):
        if final_pred[0][i] > 0.5:
            y_pred[0][i] = 1
    return y_pred

In [12]:
def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    # Compute accuracy for each possible representation
    score = y_true == y_pred

    return np.average(score, weights=sample_weight)


In [22]:
def train_with_file(data_file, iters):
    """Trains a logisitc regression classifier.

  Args:
    data_file: a path to a csv file containing training data, without headers.
    iters: the number of iterations to use when training the classifier

  Returns:
    weights: a column vector (1d numpy array) containing the weights learned in your classifier.
    normalization_params: a dict mapping column names to (min, max) values from the training set
  """
    df=read_input(data_file)
    #xtrain and y_train
    X_train= df.drop('income_bracket', axis=1)
    y_train = df['income_bracket'].apply(lambda x: 1 if x=='>50K' else 0)
    
    #preprocess
    X_train,normalization_params=preprocess_train(X_train)

    n_features = X_train.shape[1]
    print('Number of Features', n_features)
    w, b = weightInitialization(n_features)
   
    #Gradient Descent
    coeff, gradient, costs = model_train(w, b, X_train.values, y_train.values, learning_rate=0.001,no_iterations=iters)
    #weights=coeff
    #print(coeff['w'])
    #print(coeff['b'])
    m_tr =  X_train.shape[0]
    final_train_pred = sigmoid_activation(np.dot(w,X_train.values.T)+b)
    labels = np.round(final_train_pred,0)
    y_tr_pred = convertToClasses(final_train_pred, m_tr)
    #print('Training Accuracy',accuracy_score(y_tr_pred.T, y_train.values))
    return coeff, normalization_params
    
def classify(data_file, weights, normalization_params):
    """Classifies data based on the supplied logistic regression weights.

    Args:
    data_file: a path to a csv file containing test data, without headers.
    weights: a column vectors containing the weights learned during training.
    normalization_params: a dict mapping column names to (min, max) values from the training set

    Returns:
    a column vector containing either a 1 or a 0 for each row in data_file
    """
    test=read_input(data_file,True)
    X_test = test.drop('income_bracket', axis=1)
    y_test = test['income_bracket'].apply(lambda x: 1 if x=='>50K.' else 0)
    X_test=preprocess_test(X_test,normalization_params)
    w = weights["w"]
    b = weights["b"]
    final_test_pred = sigmoid_activation(np.dot(w,X_test.values.T)+b)
    m_ts =  test.shape[0]
    labels = convertToClasses(final_test_pred, m_ts)
    print('Test Accuracy',accuracy_score(labels.T, y_test.values))
    return labels

# Our grading program will use these functions as follows:
def sample_main():
    weights, normalization_params = train_with_file('adult-training.csv', 1000)
    labels = classify('adult-test.csv', weights, normalization_params)

In [23]:
%time sample_main()

Number of Features 65
Cost after 0 iteration is 0.693147
Cost after 100 iteration is 0.683225
Cost after 200 iteration is 0.673746
Cost after 300 iteration is 0.664690
Cost after 400 iteration is 0.656037
Cost after 500 iteration is 0.647767
Cost after 600 iteration is 0.639861
Cost after 700 iteration is 0.632301
Cost after 800 iteration is 0.625071
Cost after 900 iteration is 0.618152
Test Accuracy 0.7555758550254775
Wall time: 5.92 s


In [None]:
# from sklearn.metrics import accuracy_score
# from sklearn.linear_model import LogisticRegression
# df=read_input('adult-training.csv')
# #xtrain and y_train
# X_train= df.drop('income_bracket', axis=1)
# y_train = df['income_bracket'].apply(lambda x: 1 if x=='>50K' else 0)

# #preprocess
# X_train,normalization_params=preprocess_train(X_train)
# model=LogisticRegression()
# model.fit(X_train,y_train)


# test=read_input('adult-test.csv',True)
# X_test = test.drop('income_bracket', axis=1)
# y_test = test['income_bracket'].apply(lambda x: 1 if x=='>50K.' else 0)
# X_test=preprocess_test(X_test,normalization_params)

# y_test_pred=model.predict(X_test)

# print("Accuracy using Scikit learn is "accuracy_score(y_test_pred, y_test))