In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 30)

df = pd.read_csv('data/train.csv')
# df['Date'] = pd.to_datetime(df['Date'])
# df.set_index('Date', inplace = True)
df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [2]:
spray = pd.read_csv('data/spray.csv')
# spray['Date'] = pd.to_datetime(spray['Date'])
# spray.set_index('Date', inplace = True)
spray.head()

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


The function below calculates the distance between two points on Earth in kilometers, given their latitude and longitude, using Haversine formula.

In [3]:
from math import sin, cos, sqrt, asin, radians
def distance(lat1,lon1,lat2,lon2):
    # transforming longitude and latitude to radians
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    
    # latitude and longitude difference
    dlat = lat2-lat1
    dlon = lon2-lon1
    
    # radius of the Earth in kn
    R = 6373 
    
    # complicated multiplier calculated separately
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    
    # Haversine formula
    return 2*asin(sqrt(a))*R

In [4]:
# list dates when the spray was sprayed
spray_dates = spray['Date'].unique()
spray_dates

array(['2011-08-29', '2011-09-07', '2013-07-17', '2013-07-25',
       '2013-08-08', '2013-08-15', '2013-08-16', '2013-08-22',
       '2013-08-29', '2013-09-05'], dtype=object)

The loop below calculates the distance between the spray and each observation. If the distance is shorter than two specified values (0.5 and 1 km), it will assign a value of 1 to a column named with the date of the spray, that will be added to the dataframe.

In [5]:
weather = pd.read_csv('data/weather.csv')
weather['Date'] = pd.to_datetime(weather['Date'])
weather.set_index('Date', inplace = True)

In [6]:
weather.dtypes

Station          int64
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object

In [7]:
for col in list(weather.columns):
    if weather[col].dtype == 'object':
        weather[col] = pd.to_numeric(weather[col], errors='coerce')

weather.dtypes    

Station          int64
Tmax             int64
Tmin             int64
Tavg           float64
Depart         float64
DewPoint         int64
WetBulb        float64
Heat           float64
Cool           float64
Sunrise        float64
Sunset         float64
CodeSum        float64
Depth          float64
Water1         float64
SnowFall       float64
PrecipTotal    float64
StnPressure    float64
SeaLevel       float64
ResultSpeed    float64
ResultDir        int64
AvgSpeed       float64
dtype: object

In [8]:
weather.isnull().sum()

Station           0
Tmax              0
Tmin              0
Tavg             11
Depart         1472
DewPoint          0
WetBulb           4
Heat             11
Cool             11
Sunrise        1472
Sunset         1472
CodeSum        2944
Depth          1472
Water1         2944
SnowFall       1484
PrecipTotal     320
StnPressure       4
SeaLevel          9
ResultSpeed       0
ResultDir         0
AvgSpeed          3
dtype: int64

In [9]:
weather.drop(columns = ['Water1','CodeSum'], inplace = True)
weather['Sunrise'] = weather['Sunrise'].fillna(method = 'pad', limit = 1)
weather['Sunset'] = weather['Sunset'].fillna(method = 'pad', limit = 1)
weather['Depart'] = weather['Depart'].fillna(method = 'pad', limit = 1)
weather['Heat'] = weather['Heat'].fillna(method = 'pad', limit = 1)
weather['Cool'] = weather['Cool'].fillna(method = 'pad', limit = 1)
weather['SnowFall'] = weather['SnowFall'].fillna(method = 'pad', limit = 1)
weather['Depth'] = weather['Depth'].fillna(method = 'pad', limit = 1)
weather['PrecipTotal'] = weather['PrecipTotal'].fillna(method = 'pad', limit = 1)
weather['AvgSpeed'] = weather['AvgSpeed'].fillna(method = 'pad', limit = 1)
weather['SeaLevel'] = weather['SeaLevel'].fillna(method = 'pad', limit = 1)
weather['StnPressure'] = weather['StnPressure'].fillna(method = 'pad', limit = 1)


for i in range(len(weather.index)):
    if np.isnan(weather.iloc[i,3]):
        weather.iloc[i,3] = (weather.iloc[i,1]+weather.iloc[i,2])/2
weather.head()

Unnamed: 0_level_0,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2007-05-01,1,83,50,67.0,14.0,51,56.0,0.0,2.0,448.0,1849.0,0.0,0.0,0.0,29.1,29.82,1.7,27,9.2
2007-05-01,2,84,52,68.0,14.0,51,57.0,0.0,3.0,448.0,1849.0,0.0,0.0,0.0,29.18,29.82,2.7,25,9.6
2007-05-02,1,59,42,51.0,-3.0,42,47.0,14.0,0.0,447.0,1850.0,0.0,0.0,0.0,29.38,30.09,13.0,4,13.4
2007-05-02,2,60,43,52.0,-3.0,42,47.0,13.0,0.0,447.0,1850.0,0.0,0.0,0.0,29.44,30.08,13.3,2,13.4
2007-05-03,1,66,46,56.0,2.0,40,48.0,9.0,0.0,446.0,1851.0,0.0,0.0,0.0,29.39,30.12,11.7,7,11.9


In [10]:
weather['PrecipTotal'].fillna(0, inplace = True)
weather.isnull().sum()

Station         0
Tmax            0
Tmin            0
Tavg            0
Depart          0
DewPoint        0
WetBulb         4
Heat            0
Cool            0
Sunrise         0
Sunset          0
Depth           0
SnowFall       24
PrecipTotal     0
StnPressure     1
SeaLevel        0
ResultSpeed     0
ResultDir       0
AvgSpeed        0
dtype: int64

In [11]:
weather[['SnowFall','Tavg']][weather['SnowFall'].isnull()]

Unnamed: 0_level_0,SnowFall,Tavg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-08-04,,79.0
2008-08-04,,79.0
2008-10-26,,47.0
2008-10-26,,46.0
2008-10-27,,40.0
2008-10-27,,41.0
2009-10-16,,42.0
2009-10-16,,43.0
2011-05-22,,74.0
2011-05-22,,77.0


In [12]:
# wherever SnowFall is missing, we see that temperatures are warm, so I can replace 
# the missing values with 0s
weather['SnowFall'].fillna(0,inplace = True)

In [13]:
weather[weather['WetBulb'].isnull()]

Unnamed: 0_level_0,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2009-06-26,1,86,69,78.0,7.0,60,,0.0,13.0,418.0,1931.0,0.0,0.0,0.0,29.15,29.85,6.4,4,8.2
2013-08-10,1,81,64,73.0,0.0,57,,0.0,8.0,454.0,1900.0,0.0,0.0,0.0,29.34,30.08,5.3,5,6.5
2013-08-11,1,81,60,71.0,-2.0,61,,0.0,6.0,455.0,1859.0,0.0,0.0,0.01,29.35,30.07,2.0,27,3.0
2013-08-12,2,85,69,77.0,2.0,63,,0.0,12.0,456.0,1858.0,0.0,0.0,0.66,29.27,29.92,4.5,26,7.7


In [14]:
weather['WetBulb'].min()

32.0

In [15]:
weather['WetBulb'].max()

78.0

In [16]:
weather[weather['WetBulb'].isnull()].index.tolist()[0].date()

datetime.date(2009, 6, 26)

In [17]:
weather[weather["WetBulb"].isnull()].index

DatetimeIndex(['2009-06-26', '2013-08-10', '2013-08-11', '2013-08-12'], dtype='datetime64[ns]', name='Date', freq=None)

In [18]:
for index, row in weather.iterrows():
    if row['WetBulb'] == 'NaN':
        print(row['WetBulb'])

In [19]:
# creating pivot table that calculates the average values for each date
weather_mean = pd.pivot_table(weather, index = 'Date')

In [20]:
weather_mean

Unnamed: 0_level_0,AvgSpeed,Cool,Depart,Depth,DewPoint,Heat,PrecipTotal,ResultDir,ResultSpeed,SeaLevel,SnowFall,Station,StnPressure,Sunrise,Sunset,Tavg,Tmax,Tmin,WetBulb
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2007-05-01,9.40,2.5,14.0,0.0,51.0,0.0,0.000,26.0,2.20,29.820,0.0,1.5,29.140,448.0,1849.0,67.50,83.5,51.0,56.5
2007-05-02,13.40,0.0,-3.0,0.0,42.0,13.5,0.000,3.0,13.15,30.085,0.0,1.5,29.410,447.0,1850.0,51.50,59.5,42.5,47.0
2007-05-03,12.55,0.0,2.0,0.0,40.0,8.0,0.000,6.5,12.30,30.120,0.0,1.5,29.425,446.0,1851.0,57.00,66.5,47.0,49.0
2007-05-04,10.60,0.0,4.0,0.0,41.5,7.0,0.000,7.5,10.25,30.045,0.0,1.5,29.335,444.0,1852.0,61.25,72.0,50.0,50.0
2007-05-05,11.75,0.0,5.0,0.0,38.5,5.0,0.000,7.0,11.45,30.095,0.0,1.5,29.430,443.0,1853.0,60.00,66.0,53.5,49.5
2007-05-06,14.75,0.0,4.0,0.0,30.0,5.5,0.000,10.5,14.10,30.285,0.0,1.5,29.595,442.0,1855.0,59.50,68.0,50.5,46.0
2007-05-07,10.20,1.0,10.0,0.0,40.0,0.0,0.000,17.5,8.55,30.120,0.0,1.5,29.410,441.0,1856.0,66.00,83.5,48.5,53.5
2007-05-08,5.60,4.0,12.0,0.0,57.5,0.0,0.000,9.5,2.60,30.025,0.0,1.5,29.325,439.0,1857.0,69.00,81.0,57.0,62.5
2007-05-09,6.05,4.5,13.0,0.0,59.5,0.0,0.075,8.0,3.90,29.935,0.0,1.5,29.245,438.0,1858.0,69.50,76.5,62.0,63.0
2007-05-10,4.00,5.5,14.0,0.0,52.0,0.0,0.000,13.0,1.35,29.915,0.0,1.5,29.230,437.0,1859.0,70.50,83.5,57.5,60.5


In [21]:
df['Date'] = pd.to_datetime(df['Date'])
df = pd.concat([df, pd.get_dummies(df['Species'], drop_first=True)], axis=1)
df.drop(['Species','AddressNumberAndStreet','Address','Block','Street','Trap','AddressAccuracy'],
        axis = 1, inplace = True)

df.set_index('Date', inplace = True)

#,pd.get_dummies(df['Trap'], drop_first=True)

df.head()

Unnamed: 0_level_0,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2007-05-29,41.95469,-87.800991,1,0,0,1,0,0,0,0
2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0
2007-05-29,41.994991,-87.769279,1,0,0,0,1,0,0,0
2007-05-29,41.974089,-87.824812,1,0,0,1,0,0,0,0
2007-05-29,41.974089,-87.824812,4,0,0,0,1,0,0,0


In [22]:
df_temp = pd.merge(df, weather_mean, how = 'left', left_on = 'Date', right_index = True)
df_temp.drop(columns = ['Sunrise','Sunset','Station','Depart'], inplace = True)

In [23]:
df_temp

Unnamed: 0_level_0,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,AvgSpeed,Cool,Depth,DewPoint,Heat,PrecipTotal,ResultDir,ResultSpeed,SeaLevel,SnowFall,StnPressure,Tavg,Tmax,Tmin,WetBulb
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2007-05-29,41.954690,-87.800991,1,0,0,1,0,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.954690,-87.800991,1,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.994991,-87.769279,1,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.974089,-87.824812,1,0,0,1,0,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.974089,-87.824812,4,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.921600,-87.666455,2,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.891118,-87.654491,1,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.867108,-87.654224,1,0,0,1,0,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.867108,-87.654224,2,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5
2007-05-29,41.896282,-87.655232,1,0,0,0,1,0,0,0,6.95,10.5,0.0,58.5,0.0,0.000,17.0,5.80,30.100,0.0,29.415,75.5,88.0,62.5,65.5


In [24]:
df.shape

(10506, 10)

In [25]:
df_temp.shape

(10506, 25)

In [26]:
df_temp.to_csv('train_weather.csv')

In [27]:
df_temp.isnull().sum()

Latitude                  0
Longitude                 0
NumMosquitos              0
WnvPresent                0
CULEX PIPIENS             0
CULEX PIPIENS/RESTUANS    0
CULEX RESTUANS            0
CULEX SALINARIUS          0
CULEX TARSALIS            0
CULEX TERRITANS           0
AvgSpeed                  0
Cool                      0
Depth                     0
DewPoint                  0
Heat                      0
PrecipTotal               0
ResultDir                 0
ResultSpeed               0
SeaLevel                  0
SnowFall                  0
StnPressure               0
Tavg                      0
Tmax                      0
Tmin                      0
WetBulb                   0
dtype: int64

In [28]:
df_temp['Year'] = [df_temp.index[i].year for i in range(len(df_temp.index))]
df_temp['Month'] = [df_temp.index[i].month for i in range(len(df_temp.index))]
df_temp['Day'] = [df_temp.index[i].day for i in range(len(df_temp.index))]

In [29]:
features = [feat for feat in df_temp.columns if feat != "WnvPresent"]

In [30]:
X = df_temp[features]
y = df_temp['WnvPresent']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 17)

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

k = np.arange(3,30,2)
parameters = {'n_neighbors': k,
              'p': [1,2] }
knn = KNeighborsClassifier()
clf = GridSearchCV(knn,parameters,cv=10)
clf.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]), 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [32]:
clf.best_params_

{'n_neighbors': 21, 'p': 1}

In [33]:
clf.score(X_train, y_train)

0.9483579247977154

In [34]:
clf.score(X_test, y_test)

0.9448144624167459

In [35]:
from sklearn.metrics import roc_auc_score
y_pred = clf.predict(X_test)
auc_knn = roc_auc_score(y_test, y_pred)
auc_knn

0.5077658936072575

In [36]:
list(X.columns)

['Latitude',
 'Longitude',
 'NumMosquitos',
 'CULEX PIPIENS',
 'CULEX PIPIENS/RESTUANS',
 'CULEX RESTUANS',
 'CULEX SALINARIUS',
 'CULEX TARSALIS',
 'CULEX TERRITANS',
 'AvgSpeed',
 'Cool',
 'Depth',
 'DewPoint',
 'Heat',
 'PrecipTotal',
 'ResultDir',
 'ResultSpeed',
 'SeaLevel',
 'SnowFall',
 'StnPressure',
 'Tavg',
 'Tmax',
 'Tmin',
 'WetBulb',
 'Year',
 'Month',
 'Day']

In [37]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
auc_logreg = roc_auc_score(y_test, y_pred)
auc_logreg

0.5

In [38]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
auc_rfc = roc_auc_score(y_test, y_pred)
auc_rfc

0.5303090472660524

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV


params = {'n_estimators': [30,35,40,45],
          'max_depth': [50]}
rfc = RandomForestClassifier() 
rfcl = GridSearchCV(rfc,params,cv=10)
rfcl.fit(X_train, y_train)
rfcl.best_params_

{'max_depth': 50, 'n_estimators': 35}

In [40]:
from sklearn.metrics import roc_auc_score

y_pred = rfcl.predict(X_test)
auc_rfcl = roc_auc_score(y_test, y_pred)
auc_rfcl

0.551735853718804

## Best model so far

In [None]:
params = {'n_estimators': [30,35,40,45],
          'max_depth': [50]}
rfc = RandomForestClassifier() 
rfcl = GridSearchCV(rfc,params,cv=10)
rfcl.fit(X_train, y_train)

In [None]:
rfcl.best_params_

In [None]:
y_pred = rfcl.predict(X_test)
auc_rfcl = roc_auc_score(y_test, y_pred)
auc_rfcl