In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [None]:
from ucimlrepo import fetch_ucirepo

def load_dataset():
  """Fetch UCI repository dataset id=2 and return its parts.

  The dataset metadata and the variables table are printed as a side effect.

  Returns:
    A tuple ``(features, targets, variables)`` taken from the fetched object's
    ``data.features``, ``data.targets`` and ``variables`` attributes.
  """
  dataset = fetch_ucirepo(id=2)
  variables = dataset.variables

  # Echo the descriptive information before handing the data back
  print(dataset.metadata)
  print(variables)

  return dataset.data.features, dataset.data.targets, variables

In [None]:
# Fetch once and keep the raw frames under their own names; var_info is the
# variables table that the helper functions below read.
X_raw, y_raw, var_info = load_dataset()

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [None]:
import pandas as pd
def print_unique_feature_values(X, var_info=var_info):
  """Display the distinct values of every categorical/binary feature column.

  One row is shown per feature: its name, the number of distinct values, the
  distinct values themselves, and whether the literal '?' is among them.

  Args:
    X: DataFrame holding the feature columns named in ``var_info``.
    var_info: Variables table with 'name', 'role' and 'type' columns; defaults
      to the notebook-level table as it was when this function was defined.
  """
  feature_rows = var_info[var_info.role == 'Feature']
  is_categorical = feature_rows['type'] == 'Categorical'
  # 'sex' is typed 'Binary' rather than 'Categorical', so binary rows are
  # appended after the categorical ones
  is_binary = feature_rows['type'] == 'Binary'
  selected = pd.concat((feature_rows[is_categorical], feature_rows[is_binary]), axis=0)

  summary = pd.DataFrame(columns=['feature', 'unique_value_count', 'unique_values', 'has \'?\''])
  for column_name in selected['name']:
    values = X[column_name].unique().T
    contains_unknown = '?' in values.tolist()
    summary.loc[len(summary.index)] = [column_name, len(values), values, contains_unknown]
  display(summary)
# Inspect the raw data to see which categorical columns contain the '?' placeholder
print_unique_feature_values(X_raw)

Unnamed: 0,feature,unique_value_count,unique_values,has '?'
0,workclass,10,"[State-gov, Self-emp-not-inc, Private, Federal...",True
1,education,16,"[Bachelors, HS-grad, 11th, Masters, 9th, Some-...",False
2,marital-status,7,"[Never-married, Married-civ-spouse, Divorced, ...",False
3,occupation,16,"[Adm-clerical, Exec-managerial, Handlers-clean...",True
4,relationship,6,"[Not-in-family, Husband, Wife, Own-child, Unma...",False
5,race,5,"[White, Black, Asian-Pac-Islander, Amer-Indian...",False
6,native-country,43,"[United-States, Cuba, Jamaica, India, ?, Mexic...",True
7,sex,2,"[Male, Female]",False


In [None]:
def print_unknowns(X, var_info=None):
  """Display, for each feature column, how many rows hold the placeholder '?'.

  Args:
    X: DataFrame containing the feature columns named in ``var_info``.
    var_info: Variables table with 'name' and 'role' columns. When omitted,
      the notebook-level ``var_info`` is looked up at call time, which is
      what this function read before the parameter existed.
  """
  if var_info is None:
    var_info = globals()['var_info']
  feature_names = var_info.loc[var_info.role == 'Feature', 'name']
  # Build every row first and create the table once, instead of enlarging a
  # DataFrame one row at a time
  rows = [(name, int((X[name] == '?').sum())) for name in feature_names]
  display(pd.DataFrame(rows, columns=['feature', '\'?\'count']))
# Show the raw (rows, columns) shape next to the per-feature '?' counts
print(X_raw.shape)
print_unknowns(X_raw)

(48842, 14)


Unnamed: 0,feature,'?'count
0,age,0
1,workclass,1836
2,fnlwgt,0
3,education,0
4,education-num,0
5,marital-status,0
6,occupation,1843
7,relationship,0
8,race,0
9,sex,0


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

def preprocess(features, labels, var_info):
  """Drop incomplete rows and label-encode the non-numeric columns.

  A row is removed when it holds NaN in any column that ``var_info`` flags with
  ``missing_values == 'yes'``, or the literal '?' in any column whose role is
  'Feature'. The same row labels are removed from the features and the labels.
  Both inputs are copied first and are never modified.

  Args:
    features: DataFrame of raw feature columns (must include 'sex').
    labels: DataFrame with an 'income' column; assumed to share the row index
      of ``features``.
    var_info: Variables table with 'name', 'role', 'type' and 'missing_values'
      columns.

  Returns:
    A tuple ``(X, y, encoders)``: the cleaned and encoded features, the cleaned
    labels with 'income' encoded, and a DataFrame with columns
    'name' / 'encoders' holding the fitted LabelEncoder of each encoded column
    so the codes can be decoded later.
  """
  # Work on copies to avoid changing the raw features and labels
  X = features.copy()
  y = labels.copy()

  # Rows with NaN in a column flagged as having missing values, or with the
  # '?' placeholder in any feature column
  missing_cols = var_info.loc[var_info['missing_values'] == 'yes', 'name'].tolist()
  feature_cols = var_info.loc[var_info.role == 'Feature', 'name'].tolist()
  has_nan = X[missing_cols].isna().any(axis=1).to_numpy()
  has_unknown = (X[feature_cols] == '?').any(axis=1).to_numpy()
  rows_to_remove = X.index[has_nan | has_unknown]

  X = X.drop(index=rows_to_remove)
  y = y.drop(index=rows_to_remove)

  # Encode categorical feature values and keep each encoder for later decoding.
  # 'sex' is typed 'Binary' rather than 'Categorical', so it is added explicitly.
  categorical_cols = var_info.loc[var_info['type'] == 'Categorical', 'name'].tolist()
  fitted = []
  for feature_name in categorical_cols + ['sex']:
    le = LabelEncoder()
    X[feature_name] = le.fit_transform(X[feature_name])
    fitted.append((feature_name, le))

  # Some labels carry a trailing '.'; fold them into the two real classes.
  # Assign the result back: an in-place replace on y['income'] acts on a
  # temporary and is not guaranteed to reach y (it does not under pandas
  # Copy-on-Write).
  y['income'] = y['income'].replace({'<=50K.': '<=50K', '>50K.': '>50K'})
  le = LabelEncoder()
  y['income'] = le.fit_transform(y['income'])
  fitted.append(('income', le))

  encoders = pd.DataFrame({
      'name': [name for name, _ in fitted],
      'encoders': [encoder for _, encoder in fitted],
  })

  return X, y, encoders

In [None]:
# Cleaned and label-encoded data; preprocess works on copies, so X_raw / y_raw stay as loaded
X, y, encoders = preprocess(X_raw, y_raw, var_info)

In [None]:
# importing required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

LOGISTIC REGRESSION MODEL

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with default hyper-parameters
classifier_log = LogisticRegression()
# y_train is a single-column table; flatten it to 1-D so scikit-learn does not
# have to convert it (and warn about a column-vector y) on every fit
classifier_log.fit(x_train, y_train.to_numpy().ravel())
y_pred_log = classifier_log.predict(x_test)
# Evaluate on the held-out test rows
cm_log = confusion_matrix(y_test, y_pred_log)
acc_log = accuracy_score(y_test, y_pred_log)

  y = column_or_1d(y, warn=True)


SVM MODEL

In [None]:
from sklearn.svm import SVC
# Baseline RBF-kernel support vector classifier
classifier_svm = SVC(kernel='rbf', random_state=0)
# y_train is a single-column table; flatten it to 1-D so scikit-learn does not
# have to convert it (and warn about a column-vector y) on every fit
classifier_svm.fit(x_train, y_train.to_numpy().ravel())
y_pred_svm = classifier_svm.predict(x_test)
# Evaluate on the held-out test rows
cm_svm = confusion_matrix(y_test, y_pred_svm)
acc_svm = accuracy_score(y_test, y_pred_svm)

  y = column_or_1d(y, warn=True)


ACCURACY COMPARISON

In [None]:
# Column order of the comparison table; the dict keys below must match these
# names exactly (the score key previously had a trailing space and did not).
prediction_columns = ["NAME OF MODEL", "ACCURACY SCORE"]
df_pred = {"NAME OF MODEL": ["LOGISTIC REGRESSION", "SVM"],
           "ACCURACY SCORE": [acc_log, acc_svm]}
df_predictions = pd.DataFrame(df_pred, columns=prediction_columns)
df_predictions

Unnamed: 0,NAME OF MODEL,ACCURACY SCORE
0,LOGISTIC REGRESSION,0.821227
1,SVM,0.846987


HYPER-PARAMETER TUNING

In [None]:
from sklearn.model_selection import GridSearchCV

LOGISTIC REGRESSION MODEL

In [None]:
# Candidate settings for logistic regression: 2 values of C x 2 solvers, L2 penalty
parameters = [{'penalty': ['l2'], 'C': [0.1, 1],
               'solver': ['newton-cg', 'liblinear']}]
grid_search = GridSearchCV(estimator=classifier_log,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
# Flatten the single-column y_train to 1-D so each of the many refits does not
# trigger scikit-learn's column-vector conversion warning
grid_search.fit(x_train, y_train.to_numpy().ravel())
# best_score_ is the mean cross-validated accuracy over the training folds,
# not a score on x_test
best_accuracy_log = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_log)
print(best_parameters)

  y = column_or_1d(y, warn=True)


0.8198856193642424
{'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}


SVM MODEL

In [None]:
# 'gamma' only affects the rbf kernel here, so it is searched for rbf alone;
# crossing it with the linear kernel just refits identical linear models.
parameters = [{'kernel': ['linear'], 'C': [0.1, 1]},
              {'kernel': ['rbf'], 'C': [0.1, 1], 'gamma': [0.1, 0.2]}]
grid_search = GridSearchCV(estimator=classifier_svm,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=1)
# Flatten the single-column y_train to 1-D so each of the many refits does not
# trigger scikit-learn's column-vector conversion warning
grid_search.fit(x_train, y_train.to_numpy().ravel())
# best_score_ is the mean cross-validated accuracy over the training folds,
# not a score on x_test
best_accuracy_svm = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy_svm)
print(best_parameters)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

0.8455924630857844
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


FINAL ACCURACIES AFTER HYPER-PARAMETER TUNING

In [None]:
import pandas as pd
# Column order of the final table; the dict keys below must match these names
# exactly (the score key previously had a trailing space and did not).
# NOTE: the two score columns are measured differently. "ACCURACY SCORE" is
# accuracy on the held-out test split, while the tuned column holds
# GridSearchCV.best_score_, a cross-validation mean computed on the training data.
prediction_columns = ["NAME OF MODEL", "ACCURACY SCORE", "BEST ACCURACY (AFTER HYPER-PARAMETER TUNING)"]
df_pred = {"NAME OF MODEL": ["LOGISTIC REGRESSION", "SVM"],
           "ACCURACY SCORE": [acc_log, acc_svm],
           "BEST ACCURACY (AFTER HYPER-PARAMETER TUNING)": [best_accuracy_log, best_accuracy_svm]}
df_predictions = pd.DataFrame(df_pred, columns=prediction_columns)
df_predictions

Unnamed: 0,NAME OF MODEL,ACCURACY SCORE,BEST ACCURACY (AFTER HYPER-PARAMETER TUNING)
0,LOGISTIC REGRESSION,0.821227,0.819886
1,SVM,0.846987,0.845592
