# Modeling

### Imports

In [1]:
#import statements
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as ply
import seaborn as sns

#sci-kit learn
import sklearn
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.dummy import DummyClassifier


from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn. linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Import cleaned data

In [2]:
!ls

2.0-Microsoft-Movie-Analysis-AMA-Productions
collab
dsc-data-science-env-config
dsc-linear-regression-lab
dsc-phase-1-project-v2-4
dsc-sklearn-preprocessing-lab
P2-SeattleHousing
Real estate investment opportunities in King county.pptx
Tanzanian-Water-Well-Status-Classification
Topic-2
Untitled.ipynb


In [3]:
df = pd.read_csv('Tanzanian-Water-Well-Status-Classification/data/water_well_train_clean.csv')

In [4]:
df.shape

(59400, 44)

In [5]:
df['date_recorded'].value_counts()

2011-03-15    572
2011-03-17    558
2013-02-03    546
2011-03-14    520
2011-03-16    513
             ... 
2004-01-07      1
2011-09-27      1
2011-09-06      1
2002-10-14      1
2011-08-31      1
Name: date_recorded, Length: 356, dtype: int64

In [6]:
df['district_code']  = df.district_code.astype('str')

In [7]:
df['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [8]:
df['status_group'] = df['status_group'].str.replace('functional needs repair','functional')

In [9]:
df['status_group'].value_counts()

functional        36576
non functional    22824
Name: status_group, dtype: int64

In [54]:
df['basin'].count()

59400

In [None]:
region_codes

In [47]:
df.columns.tolist()

['Unnamed: 0',
 'id',
 'amount_tsh',
 'date_recorded',
 'funder',
 'gps_height',
 'installer',
 'longitude',
 'latitude',
 'wpt_name',
 'num_private',
 'basin',
 'subvillage',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'scheme_name',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group',
 'id_label',
 'status_group',
 'date_recorded_datetime']

# First Model

## Setting up Pipelines

First model will include the following features:
    
amount_tsh,
installer, 
gps_height, 
basin, 
region, 
district_code, 
lga, 
population, 
public_meeting, 
management, 
permit, 
extraction_type, 
payment, 
water_quality, 
quantity, 
source, 
waterpoint_type

With the target:
status_group

In [10]:
X = df.drop('status_group', axis=1)

In [11]:
y = df['status_group']

In [12]:
picked_features = ['amount_tsh', 'gps_height', 'basin', 'region', 'lga', 'population', 
                   'public_meeting', 'management', 'permit', 'extraction_type', 'payment', 'water_quality', 'quantity', 
                   'source', 'waterpoint_type', 'district_code']
len(picked_features)

16

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
X_picked = X[picked_features]

In [15]:
X_picked.head()

Unnamed: 0,amount_tsh,gps_height,basin,region,lga,population,public_meeting,management,permit,extraction_type,payment,water_quality,quantity,source,waterpoint_type,district_code
0,6000.0,1390,Lake Nyasa,Iringa,Ludewa,109,True,vwc,False,gravity,pay annually,soft,enough,spring,communal standpipe,5
1,0.0,1399,Lake Victoria,Mara,Serengeti,280,True,wug,True,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe,2
2,25.0,686,Pangani,Manyara,Simanjiro,250,True,vwc,True,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple,4
3,0.0,263,Ruvuma / Southern Coast,Mtwara,Nanyumbu,58,True,vwc,True,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple,63
4,0.0,0,Lake Victoria,Kagera,Karagwe,0,True,other,True,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe,1


In [16]:
X_num = X_picked.select_dtypes(include='number')
X_num.head()

Unnamed: 0,amount_tsh,gps_height,population
0,6000.0,1390,109
1,0.0,1399,280
2,25.0,686,250
3,0.0,263,58
4,0.0,0,0


In [17]:
num_cols = list(X_num.columns)
num_cols

['amount_tsh', 'gps_height', 'population']

In [18]:
X_bool = X_picked.select_dtypes(include='boolean')

In [19]:
bool_cols = list(X_bool.columns)

In [20]:
X_cat = X_picked.select_dtypes(include='object')
X_cat.head()

Unnamed: 0,basin,region,lga,management,extraction_type,payment,water_quality,quantity,source,waterpoint_type,district_code
0,Lake Nyasa,Iringa,Ludewa,vwc,gravity,pay annually,soft,enough,spring,communal standpipe,5
1,Lake Victoria,Mara,Serengeti,wug,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe,2
2,Pangani,Manyara,Simanjiro,vwc,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple,4
3,Ruvuma / Southern Coast,Mtwara,Nanyumbu,vwc,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple,63
4,Lake Victoria,Kagera,Karagwe,other,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe,1


In [21]:
cat_cols = list(X_cat.columns)
cat_cols

['basin',
 'region',
 'lga',
 'management',
 'extraction_type',
 'payment',
 'water_quality',
 'quantity',
 'source',
 'waterpoint_type',
 'district_code']

In [22]:
#create subpipes for numeric and categorical features
#StandardScaler on numerical features
subpipe_num = Pipeline(steps=[('ss', StandardScaler() )])
#OHE categorical features, handle_unknown = ignore, drop first if binary
subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore') )])
#booleans
subpipe_bool = Pipeline(steps=[('ord', OrdinalEncoder())])



In [23]:
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, num_cols),
                                     ('subpipe_cat', subpipe_cat, cat_cols),
                                    ('subpipe_bool', subpipe_bool, bool_cols)],
                       remainder='drop')


In [24]:
dtc = DecisionTreeClassifier(random_state=42)

first_model_pipe = Pipeline(steps = [('ct', CT),
                            ('dtc', dtc )])

In [25]:
first_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'population']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['basin', 'region', 'lga',
                                                   'management',
                                                   'extraction_type', 'payment',
                                      

In [33]:
first_model_pipe.score(X_train, y_train)

0.9584511784511784

In [34]:
first_model_pipe.score(X_test, y_test)

0.8115824915824916

In [35]:
cross_val_score(first_model_pipe, X_train, y_train).mean()

0.8126374859708193

# DummyClassifier Baseline

In [26]:
dummy_clf = DummyClassifier(strategy='most_frequent')

In [27]:
dummy_clf.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [28]:
dummy_clf.score(X_train, y_train)

0.6151290684624018

## Knn pipeline

In [33]:
knn = KNeighborsClassifier(n_neighbors=12, leaf_size=2, metric='manhattan', p=2, weights='distance')

knn_model_pipe = Pipeline(steps = [('ct', CT),
                            ('KN', knn )])
knn_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'population']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['basin', 'region', 'lga',
                                                   'management',
                                                   'extraction_type', 'payment',
                                      

In [35]:
cross_val_score(knn_model_pipe, X_train, y_train).mean()

0.8300785634118967

## Bagging

In [44]:
dt2 = DecisionTreeClassifier(criterion='gini', max_depth=50, min_samples_leaf=1,min_samples_split=10)
bagging_dt_model = BaggingClassifier(base_estimator=dt2, n_estimators=100, random_state=42)
bag_dt_pipe = Pipeline(steps = [('ct', CT),
                            ('baglogreg', bagging_dt_model )])


In [45]:
cross_val_score(bag_dt_pipe, X_train, y_train, verbose=3).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.842, total= 1.4min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[CV] .................................... , score=0.843, total= 1.3min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s


[CV] .................................... , score=0.840, total= 1.3min
[CV]  ................................................................
[CV] .................................... , score=0.844, total= 1.3min
[CV]  ................................................................
[CV] .................................... , score=0.837, total= 1.3min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.7min finished


0.8415488215488216

In [69]:
dtc_params = {
    'baglogreg__base_estimator':[dt2],
    'baglogreg__max_features': [0.5, 0.7, 1.0],
    'baglogreg__max_samples': [0.5, 0.7, 1.0],
    'baglogreg__n_estimators': [25, 50, 75, 100]
}

dtc_gs = GridSearchCV(bag_dt_pipe, dtc_params, cv=5, verbose=1, n_jobs=-1)
dtc_gs.fit(X_train, y_train)
dtc_best = dtc_gs.best_estimator_
dtc_gs.best_params_


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 29.5min finished


{'baglogreg__base_estimator': DecisionTreeClassifier(max_depth=50, min_samples_split=10),
 'baglogreg__max_features': 0.7,
 'baglogreg__max_samples': 1.0,
 'baglogreg__n_estimators': 100}

In [71]:
best_params = {'baglogreg__base_estimator': DecisionTreeClassifier(max_depth=50, min_samples_split=10),
 'baglogreg__max_features': 0.7,
 'baglogreg__max_samples': 1.0,
 'baglogreg__n_estimators': 100}


In [70]:
cross_val_score(dtc_best, X_train, y_train, n_jobs=-1).mean()

0.8466217732884399

In [78]:
dtc_best

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'population']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['basin', 'region', 'lga',
                                                   'management',
                                                   'extraction_type', 'payment',
                                      

In [91]:
feature_importances2 = np.mean([
    tree.feature_importances_ for tree in dtc_best.named_steps["baglogreg"].estimators_
], axis=0)
feature_importances

array([0.003789  , 0.00597056, 0.00326782, 0.0040489 , 0.00314392,
       0.00334968, 0.00666225, 0.00408372, 0.01023413, 0.01202843,
       0.00679633, 0.00425639, 0.00587863, 0.00823688, 0.00626697,
       0.00525997, 0.00371261, 0.00657522, 0.00589749, 0.01011355,
       0.00448579, 0.00479602, 0.00932415, 0.00666615, 0.00500648,
       0.00449671, 0.0057594 , 0.00519228, 0.00891687, 0.00282758,
       0.00625752, 0.00420159, 0.00332805, 0.00670713, 0.003653  ,
       0.00325947, 0.00362244, 0.00362665, 0.00811835, 0.00520889,
       0.00548981, 0.01135741, 0.00633406, 0.00496216, 0.00278949,
       0.00510991, 0.00451433, 0.00731533, 0.006153  , 0.00680372,
       0.00566614, 0.00414933, 0.00768801, 0.00246232, 0.00338458,
       0.00585217, 0.00314174, 0.00483826, 0.00855516, 0.00628121,
       0.00727594, 0.00971308, 0.01414909, 0.00756395, 0.0058914 ,
       0.00756868, 0.00612501, 0.0048322 , 0.00416025, 0.00492953,
       0.00399707, 0.00547831, 0.00302755, 0.00755225, 0.00969

In [92]:
dtc_best.named_steps['ct'].transformers_[1][1].named_steps['ohe'].get_feature_names()
num_names = dtc_best.named_steps['ct'].transformers_[0][2]
ohe_names = dtc_best.named_steps['ct'].transformers_[1][1].named_steps['ohe'].get_feature_names()
bool_names = dtc_best.named_steps['ct'].transformers_[2][2]
all_names = np.concatenate( (num_names, ohe_names, bool_names) )
feature_importances = {name:score for name, score in zip(all_names, feature_importances2)}
feature_importances
sorted(feature_importances, key=feature_importances.get, reverse=True)[:5]

['x2_Kibaha', 'x2_Mbeya Rural', 'x0_Rufiji', 'x2_Bukoba Urban', 'x2_Tarime']

## Gridsearch

In [94]:
grid = {'KN__n_neighbors':range(6, 10), 'KN__leaf_size':range(4), 'KN__metric':['minkowski', 'manhattan'], 'KN__weights':['uniform', 'distance'], 'KN__p':[2]}

knn_gs = GridSearchCV(estimator=knn_model_pipe, param_grid=grid, cv=5, n_jobs=-1, verbose=3)
knn_gs.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 27.6min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 32.8min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'population']),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                            

In [95]:
knn_gs.best_params_

{'KN__leaf_size': 1,
 'KN__metric': 'manhattan',
 'KN__n_neighbors': 9,
 'KN__p': 2,
 'KN__weights': 'distance'}

In [97]:
knn_gs.best_score_

0.829023569023569

In [98]:
knn_gs.best_estimator_.score(X_test, y_test)

0.8263299663299664

## Ensemble

In [99]:
final = RandomForestClassifier(criterion='gini', max_depth=50, min_samples_leaf=1,min_samples_split=5)

In [100]:
SC =  StackingClassifier(estimators=[('KNN', knn_gs.best_estimator_), ('dtc_bag', dtc_best)], final_estimator=final, verbose=3)

In [101]:
SC.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.4min finished


StackingClassifier(estimators=[('KNN',
                                Pipeline(steps=[('ct',
                                                 ColumnTransformer(transformers=[('subpipe_num',
                                                                                  Pipeline(steps=[('ss',
                                                                                                   StandardScaler())]),
                                                                                  ['amount_tsh',
                                                                                   'gps_height',
                                                                                   'population']),
                                                                                 ('subpipe_cat',
                                                                                  Pipeline(steps=[('ohe',
                                                                                        

In [102]:
cross_val_score(SC, X_train, y_train).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   59.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   57.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concur

0.8152413019079685

In [192]:
import math

def geodistance(x1, y1, x2, y2):
    
    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    return dist

In [158]:
zipped_coord = list(zip(df.longitude.tolist(), df.latitude.tolist()))

In [160]:
from sklearn.neighbors import NearestNeighbors

In [189]:
df1 = df[0:3].copy()

In [190]:
df1

Unnamed: 0.1,Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,id_label,status_group,date_recorded_datetime
0,0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,69572,functional,2011-03-14
1,1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,8776,functional,2013-03-06
2,2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,34310,functional,2013-02-25


In [193]:
geodistance(34.938093, 9.856322, 34.698766, 2.147466)

7.71257014500776

In [191]:
neigh = NearestNeighbors(n_neighbors=1, radius=10)
neigh.fit(df1[['longitude']], df1['latitude'])
neigh.kneighbors(return_distance=True)

# NearestNeighbors(n_neighbors=1)
# >>> print(neigh.kneighbors([[1., 1., 1.]]))
# (array([[0.5]]), array([[2]]))

(array([[0.23932665],
        [0.23932665],
        [2.52257171]]),
 array([[1],
        [0],
        [0]], dtype=int64))

In [198]:
df.longitude.value_counts()

0.000000     1812
39.088875       2
39.105307       2
37.543401       2
38.180538       2
             ... 
38.710520       1
40.117029       1
34.672962       1
39.433604       1
34.890838       1
Name: longitude, Length: 57516, dtype: int64

In [205]:
df.latitude.value_counts()

-7.065373     2
-6.990549     2
-6.975594     2
-6.983115     2
-6.980220     2
             ..
-10.860985    1
-3.149066     1
-3.305834     1
-9.076967     1
-2.598965     1
Name: latitude, Length: 57516, dtype: int64

In [202]:
# dropping longitude of 0
df = df.loc[df["longitude"] != 0]
df.longitude.value_counts()

39.105307    2
37.252194    2
37.250111    2
39.088875    2
37.318911    2
            ..
40.117029    1
34.672962    1
39.433604    1
35.901580    1
35.005922    1
Name: longitude, Length: 57515, dtype: int64

In [None]:

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(=1)
neigh.fit(samples)
NearestNeighbors(n_neighbors=1)
print(neigh.kneighbors([[1., 1., 1.]]))
(array([[0.5]]), array([[2]]))

In [228]:
#creating coordinates column
df['coordinates'] = df[['latitude', 'longitude']].values.tolist()
npcoordinates = df['coordinates'].to_list()

In [229]:
type(npcoordinates[0][0])

float

In [226]:
samples = [[0., 0], [0., .5], [1., 1]]
type(samples[0][0])

float

In [230]:
from sklearn.neighbors import NearestNeighbors
near = NearestNeighbors(n_neighbors=1)
near.fit(npcoordinates)

NearestNeighbors(n_neighbors=1)

In [241]:
varlist = [df.coordinates[0]]
varlist

[[-9.85632177, 34.93809275]]

In [247]:
near.kneighbors(npcoordinates, return_distance=True)

(array([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]]),
 array([[    0],
        [    1],
        [    2],
        ...,
        [57585],
        [57586],
        [57587]], dtype=int64))

In [257]:
lst_holder = []

for i in range(len(npcoordinates)):
    print(i)
    near = NearestNeighbors(n_neighbors=1)
    copy_coord = npcoordinates.copy() 
    del copy_coord[i] 

    near.fit(copy_coord)
    
#     print(npcoordinates[i])
    kneigh = near.kneighbors([npcoordinates[i]], return_distance=True)[0]
    lst_holder.append(kneigh)
print(kneigh)    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

KeyboardInterrupt: 

In [258]:
!ls

2.0-Microsoft-Movie-Analysis-AMA-Productions
collab
dsc-data-science-env-config
dsc-linear-regression-lab
dsc-phase-1-project-v2-4
dsc-sklearn-preprocessing-lab
P2-SeattleHousing
Real estate investment opportunities in King county.pptx
Tanzanian-Water-Well-Status-Classification
Topic-2
Untitled.ipynb
