In [15]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [30]:
#adding columns headers to the data
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'salary'
]

data = pd.read_csv('C:/Users/tanya/OneDrive/ProjectsSubmitted/ALY 6020/adult-all.csv', names=columns)
data


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [34]:
#Handling missing values
data.isnull().sum()
data.replace('?', np.nan, inplace=True)
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [19]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [20]:
# Convert numerical columns to appropriate data types
numerical_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
data[numerical_columns] = data[numerical_columns].astype(float)

In [21]:
#check for duplicates
data.drop_duplicates(inplace=True)

In [22]:
# Convert the 'salary' column to binary values (0 for <=50K, 1 for >50K)
data['salary'] = data['salary'].map({'<=50K': 0, '>50K': 1})


In [23]:
# Split the data into features (X) and the target variable (y)
X = data.drop('salary', axis=1)
y = data['salary']

In [24]:
# Encode categorical features
categorical_columns = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for column in categorical_columns:
    X[column] = encoder.fit_transform(X[column])

In [65]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Feature Importance Analysis
# Assuming X_train and y_train are your feature and target variables
# Example using Recursive Feature Elimination (RFE) with a Random Forest Classifier
estimator = RandomForestClassifier()
selector = RFE(estimator, n_features_to_select=10)  # Choose the number of important features
selector = selector.fit(X_train, y_train)
selected_features = X_train.columns[selector.support_]

# Tune K Value
param_grid = {'n_neighbors': [5, 11, 17, 21, 25]}  # Define a range of K values
knn = KNeighborsClassifier()
f1_scorer = make_scorer(f1_score, average='weighted')
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring=f1_scorer)
grid_search.fit(X_train[selected_features], y_train)

# Get the best K value from the grid search
best_k = grid_search.best_params_['n_neighbors']
print("Best K Value:", best_k)


Best K Value: 11


In [67]:
# Part 2: Building a k-NN Model

# Choose a value for K
k = 11
knn = KNeighborsClassifier(n_neighbors=k)

In [68]:
# Evaluate the model using cross-validation
scores = cross_val_score(knn, X_train, y_train, cv=5)  # You can adjust the number of cross-validation folds

# Calculate the average accuracy
accuracy = scores.mean()

# Fit the model to the training data
knn.fit(X_train, y_train)

# Predict on the test data
y_pred = knn.predict(X_test)

In [69]:
# Evaluate the model on the test data
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [70]:
# Print results
print(f"Model Accuracy (Cross-Validation): {accuracy:.2f}")
print(f"Model Accuracy (Test Data): {test_accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy (Cross-Validation): 0.79
Model Accuracy (Test Data): 0.79
Confusion Matrix:
[[7087  276]
 [1777  618]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.96      0.87      7363
           1       0.69      0.26      0.38      2395

    accuracy                           0.79      9758
   macro avg       0.75      0.61      0.62      9758
weighted avg       0.77      0.79      0.75      9758

