# KNN Clustering

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(xtrain, ytrain)
ypred = knn.predict(xtest)
print("Predictions: {}".format(ypred))
score = knn.score(xtest, ytest)
print("Score: {:.4f}".format(score))

Predictions: [1 2 3 ... 2 4 4]
Score: 0.6841


### Hyperparameter Tuning
First Randomized Search Cross Validation, then Grid Search Cross Validation to find the best parameter set up that prevents overfitting and underfitting and eventually leading to the best accuracy score. 

In [121]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_neighbors': [1, 5, 10, 20, 30, 40, 60, 80, 100],
            "weights": ["uniform", "distance"]}

knn = KNeighborsClassifier()
knn_cv = RandomizedSearchCV(knn, param_dist, cv=4)
knn_cv.fit(train_hh, train_pov)
print("Tuned Decision KNN Parameters: {}".format(knn_cv.best_params_))
print("Best score is {}".format(knn_cv.best_score_))

Tuned Decision KNN Parameters: {'weights': 'distance', 'n_neighbors': 10}
Best score is 0.6774341002590278


In [122]:
from sklearn.model_selection import GridSearchCV

param_dist = {'n_neighbors': [8,9,10,11,12,13],
             'weights': ['distance']}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_dist, cv=4)
knn_cv.fit(train_hh, train_pov)
print("Tuned Decision KNN Parameters: {}".format(knn_cv.best_params_))
print("Best score is {}".format(knn_cv.best_score_))

Tuned Decision KNN Parameters: {'n_neighbors': 12, 'weights': 'distance'}
Best score is 0.6812433338412311


In [123]:
#kfoldcv 
from sklearn.model_selection import cross_val_score
knn=KNeighborsClassifier(n_neighbors=12, weights='distance')
scores = cross_val_score(knn, train_hh, train_pov, cv=7)
print("Average Score: {}".format(np.mean(scores)))

Average Score: 0.6839868906728981


In [124]:
knn = KNeighborsClassifier(n_neighbors=12, weights='distance')
knn.fit(train_hh, train_pov)
pov_pred = knn.predict(test_hh)
print("Predictions: {}".format(pov_pred))
score = knn.score(test_hh, test_pov)
print("Score: {:.4f}".format(score))

Predictions: [4 4 4 ... 4 4 4]
Score: 0.5894


In [125]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

print("Classification Report:\n", classification_report(test_pov, pov_pred))
print("Confusion Matrix:\n", confusion_matrix(test_pov, pov_pred))

Classification Report:
              precision    recall  f1-score   support

          1       0.29      0.08      0.12       239
          2       0.29      0.20      0.24       483
          3       0.23      0.06      0.10       366
          4       0.66      0.88      0.75      1725

avg / total       0.51      0.59      0.52      2813

Confusion Matrix:
 [[  19   49   11  160]
 [  22   97   23  341]
 [   8   39   22  297]
 [  16  148   41 1520]]


After tuning our KNN model, we've found that the best parameter is using 8 neighbors and using distance to calculate the weights. This leads to an average score of 0.795 accuracy with our training data. 

# Support Vector Classification

In [20]:

#randomized search cv
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas, 'class_weight':['balanced']}
svc = SVC()
svc_cv = RandomizedSearchCV(svc, param_grid, cv=5)
svc_cv.fit(xtrain, ytrain)
print("Tuned Decision SVC Parameters: {}".format(svc_cv.best_params_))
print("Best score is {}".format(svc_cv.best_score_))

KeyboardInterrupt: 

In [128]:
#grid search cv
Cs = [6, 8, 10, 12, 14]
gammas = [0.06, 0.08, 0.1, 0.12, 0.14]
param_grid = {'C':Cs, 'gamma':gammas, 'class_weight':['balanced']}
svc=SVC()
svc_cv = GridSearchCV(svc, param_grid, cv=4)
svc_cv.fit(train_hh, train_pov)
print("Tuned Decision SVC Parameters: {}".format(svc_cv.best_params_))
print("Best score is {}".format(svc_cv.best_score_))

Tuned Decision SVC Parameters: {'C': 12, 'class_weight': 'balanced', 'gamma': 0.14}
Best score is 0.6801777777777778


In [129]:
svc = SVC(C=20, gamma=0.14, class_weight='balanced')
svc.fit(train_hh, train_pov)
pov_predict = svc.predict(test_hh)
print("Predictions: {}".format(pov_predict))
score = svc.score(test_hh, test_pov)
print("Score: {:.4f}".format(score))

Predictions: [2 4 4 ... 4 4 4]
Score: 0.7014


In [130]:
print("Classification Report:\n", classification_report(test_pov, pov_predict))
print("Confusion Matrix:\n", confusion_matrix(test_pov, pov_predict))

Classification Report:
              precision    recall  f1-score   support

          1       0.47      0.47      0.47       309
          2       0.53      0.52      0.53       647
          3       0.53      0.43      0.48       488
          4       0.80      0.84      0.82      2307

avg / total       0.69      0.70      0.70      3751

Confusion Matrix:
 [[ 146   50   28   85]
 [  50  339   47  211]
 [  36   67  210  175]
 [  79  182  110 1936]]


do kfoldcv to avoid overfitting. redo model on evenly distributed test set in terms of poverty level, to avoid bias

In [131]:
svc = SVC(C=18, gamma=0.08)
scores = cross_val_score(svc, train_hh, train_pov, cv=7)
print('Average Score: ', np.mean(scores))

Average Score:  0.7104002080554396


# Random Forest Trees

In [140]:
#randomized search cv
from sklearn.ensemble import RandomForestClassifier
param_dist = {"max_depth": [3, None],
              "max_features": range(1, 11),
              "min_samples_split": range(2, 11),
              "min_samples_leaf": range(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
rfc = RandomForestClassifier()
rfc_cv = RandomizedSearchCV(rfc, param_dist, cv=5)
rfc_cv.fit(train_hh, train_pov)
print("Tuned Decision RFC Parameters: {}".format(rfc_cv.best_params_))
print("Best score is {}".format(rfc_cv.best_score_))

Tuned Decision RFC Parameters: {'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 5, 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Best score is 0.7626666666666667


In [142]:
train_hh, test_hh, train_pov, test_pov = split()
rfc = RandomForestClassifier(min_samples_split=4, min_samples_leaf=2, max_features=2, max_depth=None, 
                            criterion='gini', bootstrap=False)
rfc.fit(train_hh, train_pov)
pov_pred = rfc.predict(test_hh)
print("Predictions: {}".format(pov_pred))
score = rfc.score(test_hh, test_pov)
print("Score: {:.4f}".format(score))

TypeError: 'NoneType' object is not iterable

In [134]:
print("Classification Report:\n", classification_report(test_pov, pov_pred))
print("Confusion Matrix:\n", confusion_matrix(test_pov, pov_pred))

Classification Report:
              precision    recall  f1-score   support

          1       0.80      0.39      0.52       309
          2       0.70      0.51      0.59       647
          3       0.73      0.32      0.45       488
          4       0.76      0.96      0.85      2307

avg / total       0.75      0.75      0.72      3751

Confusion Matrix:
 [[ 121   33   11  144]
 [  16  330   24  277]
 [   2   48  157  281]
 [  13   60   22 2212]]


It seems like for all three models, after removing PCA, we get extremely high classification scores. However, we need to be concerned with having an imbalanced dataset. The majority of our datapoints in our training set have been labeled as 'Non-vulnerable'. We will perform these techniques again after resampling our data to be more representative and prevent overfitting. 

Despite this, our precision scores, recall scores, and confusion matrices look really good.

In [135]:
train['Target'].value_counts()/sum(train['Target'].value_counts())

4    0.620201
2    0.170328
3    0.128946
1    0.080525
Name: Target, dtype: float64

'Non-Vulnerable' datapoints make up over 62% of the data.

In [136]:
#perform Random Forest Classification with balanced class
split()
rfc = RandomForestClassifier(min_samples_split=4, min_samples_leaf=2, max_features=2, max_depth=None, 
                            criterion='gini', bootstrap=False, class_weight='balanced')
rfc.fit(train_hh, train_pov)
pov_pred = rfc.predict(test_hh)
print("Predictions: {}".format(pov_pred))
score = rfc.score(test_hh, test_pov)
print("Score: {:.4f}".format(score))
print("Classification Report:\n", classification_report(test_pov, pov_pred))
print("Confusion Matrix:\n", confusion_matrix(test_pov, pov_pred))

Predictions: [4 4 3 ... 4 4 4]
Score: 0.7433
Classification Report:
              precision    recall  f1-score   support

          1       0.57      0.60      0.59       309
          2       0.58      0.62      0.60       647
          3       0.53      0.52      0.53       488
          4       0.86      0.84      0.85      2307

avg / total       0.75      0.74      0.74      3751

Confusion Matrix:
 [[ 186   47   26   50]
 [  48  400   68  131]
 [  30   72  253  133]
 [  61  170  127 1949]]


We get much better scores overall, and improved recall scores for each class.

In [137]:
#oversampling the training data
split()

#SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE()
x_train, y_train = sm.fit_sample(train_hh, train_pov)

#SVC
svc = SVC(C=18, gamma=0.08)
svc.fit(train_hh, train_pov)
pov_predict = svc.predict(test_hh)
print('SVC:')
print("Predictions: {}".format(pov_predict))
score = svc.score(test_hh, test_pov)
print("Score: {:.4f}".format(score))
print("Classification Report:\n", classification_report(test_pov, pov_pred))
print("Confusion Matrix:\n", confusion_matrix(test_pov, pov_pred))
print('--------------------------------------------')

#KNN
knn = KNeighborsClassifier(n_neighbors=60, weights='distance')
knn.fit(train_hh, train_pov)
pov_pred = knn.predict(test_hh)
print('KNN: ')
print("Predictions: {}".format(pov_pred))
score = knn.score(test_hh, test_pov)
print("Score: {:.4f}".format(score))
print("Classification Report:\n", classification_report(test_pov, pov_predict))
print("Confusion Matrix:\n", confusion_matrix(test_pov, pov_predict))

ModuleNotFoundError: No module named 'imblearn'

In [None]:
import xgboost as xgb

xgb = xgb.XGBClassifier()


# Fitting to test data

In [138]:
rfc = RandomForestClassifier(min_samples_split=4, min_samples_leaf=2, max_features=2, max_depth=None, 
                            criterion='gini', bootstrap=False, class_weight='balanced')
X = train.drop(['Target'], axis=1)
y = train.Target


Xnum = X.select_dtypes(include=np.number)
testnum = test.select_dtypes(include=np.number)

Xcat = X.select_dtypes(include='object')
testcat = test.select_dtypes(include='object')

names = Xnum.columns

#standardize
scaler = StandardScaler()
scaler.fit(Xnum)
Xnum = pd.DataFrame(scaler.transform(Xnum), columns=names)
testnum = pd.DataFrame(scaler.transform(testnum), columns=names)
X = pd.concat([Xnum, Xcat], axis=1)
test = pd.concat([testnum, testcat], axis=1)
X = X.dropna(axis=0)

rfc.fit(X, y)
pred = rfc.predict(test)
pred 

ValueError: could not convert string to float: 'Public'