In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn import preprocessing

In [None]:
# Read the diabetes data and separate the 'Outcome' target from the feature columns.
df = pd.read_csv('diabetes.csv')
y = df['Outcome']
X = df.drop(columns='Outcome')
X_col = X.columns  # feature names, reused later when plotting feature importances
df

In [None]:
# Hold out 20% of the rows as a test set, stratifying the split on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Fit 100 decision trees and bag them.
# The base learner is called `base_tree` so that it does not shadow the
# `sklearn.tree` module imported at the top of the notebook.
base_tree = DecisionTreeClassifier()
# oob_score=True makes the out-of-bag accuracy available as bag.oob_score_.
bag = BaggingClassifier(base_tree, n_estimators=100, random_state=0, oob_score=True)
bag.fit(X_train, y_train)

In [None]:
# Compare accuracy on the held-out test set with the out-of-bag estimate.
y_pred = bag.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', test_accuracy, 'Oob Accuracy:', bag.oob_score_)

In [None]:
# Random forest with 100 trees: report test accuracy next to the out-of-bag estimate.
clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', test_accuracy, 'Oob Accuracy:', clf.oob_score_)

In [None]:
# Out-of-bag accuracy for forests of 50, 100, ..., 500 trees.
# Tree counts are generated directly as integers and each score is stored as a
# plain float (not wrapped in a 0-d array), so both DataFrame columns hold
# ordinary numeric values.
Oob_Accuracy = []
for n_trees in range(50, 501, 50):
    clf = RandomForestClassifier(random_state=0, n_estimators=n_trees, oob_score=True)
    clf.fit(X_train, y_train)
    Oob_Accuracy.append([n_trees, float(clf.oob_score_)])
df = pd.DataFrame(Oob_Accuracy, columns=['Number_of_Trees', 'Oob Accuracy'])
df

In [None]:
# Line plot of out-of-bag accuracy against the number of trees.
fig, ax = plt.subplots()
tree_counts = df['Number_of_Trees'].values
oob_scores = df['Oob Accuracy'].values
ax.plot(tree_counts, oob_scores, label='Oob Accuracy')
ax.set_xlabel('Number_of_Trees')
ax.set_ylabel('Accuracy')
ax.tick_params(axis='x', labelsize=8)
ax.legend(loc='best')

In [None]:
# 350 trees are used here; a larger number would also have been fine, because
# adding more trees does not lead to overfitting.
clf = RandomForestClassifier(n_estimators=350, oob_score=True, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Out-of-bag accuracy for each number of features considered at a split, 1..8
# (the default is sqrt(p), where p is the number of features).
# Each score is stored as a plain float rather than a 0-d array so the
# 'Oob Accuracy' column holds ordinary numeric values.
Oob_Accuracy = []
for n_feats in range(1, 9):
    clf = RandomForestClassifier(random_state=0, n_estimators=350, max_features=n_feats, oob_score=True)
    clf.fit(X_train, y_train)
    Oob_Accuracy.append([n_feats, float(clf.oob_score_)])
df = pd.DataFrame(Oob_Accuracy, columns=['Number_of_Features', 'Oob Accuracy'])
df

In [None]:
# Forest with 350 trees and 3 candidate features per split, scored on the test set.
clf = RandomForestClassifier(
    n_estimators=350,
    max_features=3,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Draw a different stratified 80/20 split (random_state=10 this time).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

In [None]:
# Out-of-bag accuracy for max_features = 1..8 on the current training split.
# Only the OOB score is recorded, so no test-set prediction is made inside the
# loop; each score is stored as a plain float rather than a 0-d array.
Oob_Accuracy = []
for n_feats in range(1, 9):
    clf = RandomForestClassifier(random_state=0, n_estimators=350, max_features=n_feats, oob_score=True)
    clf.fit(X_train, y_train)
    Oob_Accuracy.append([n_feats, float(clf.oob_score_)])
df = pd.DataFrame(Oob_Accuracy, columns=['Number_of_Features', 'Oob Accuracy'])
df

In [None]:
# Forest with 350 trees and 6 candidate features per split, scored on the test set.
clf = RandomForestClassifier(
    n_estimators=350,
    max_features=6,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Base estimator and hyper-parameter grid for the grid search.
clf = RandomForestClassifier(random_state=0)
n_estimators = list(range(100, 601, 100))  # number of trees in the random forest
max_features = list(range(1, 9))           # number of features at every split
params = dict(n_estimators=n_estimators, max_features=max_features)
params

In [None]:
# 5-fold cross-validated grid search over `params`, scored by accuracy.
clf_grid = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
clf_grid.fit(X_train, y_train)
# Show the best parameter combination found.
print(clf_grid.best_params_)

In [None]:
# Forest with 300 trees and 4 candidate features per split, scored on the test set.
clf = RandomForestClassifier(
    n_estimators=300,
    max_features=4,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Return to the stratified 80/20 split drawn with random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Repeat the 5-fold accuracy grid search on the current training split.
clf_grid = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
clf_grid.fit(X_train, y_train)
print(clf_grid.best_params_)
# The results are different :) This will always happen, because changing the
# training and test sets introduces variability.

In [None]:
# Forest with 100 trees and 6 candidate features per split, scored on the test set.
clf = RandomForestClassifier(
    n_estimators=100,
    max_features=6,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
# Nested cross-validation: instead of a single test set, the outer 5-fold loop
# provides the held-out data, and the inner GridSearchCV (also 5-fold) selects the
# hyper-parameters using only the outer-training part of each fold.
from sklearn.model_selection import KFold

clf = RandomForestClassifier(random_state=0)
# number of trees in random forest
n_estimators = [100, 200, 300, 400, 500, 600]
# number of features at every split
max_features = [1, 2, 3, 4, 5, 6, 7, 8]
# create grid
params = {
    'n_estimators': n_estimators,
    'max_features': max_features,
}

cv = KFold(n_splits=5, random_state=1, shuffle=True)
CVErrors = []  # despite the name, this collects the accuracy obtained on each outer fold
for train_index, validation_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[validation_index]
    y_train, y_test = y.iloc[train_index], y.iloc[validation_index]
    # Inner grid search of parameters on the outer-training data only
    clf_grid = GridSearchCV(estimator=clf, param_grid=params,
                            cv=5, verbose=2, scoring='accuracy', n_jobs=-1)
    clf_grid.fit(X_train, y_train)
    print(clf_grid.best_params_)
    # GridSearchCV refits the best configuration on all of X_train by default, so
    # predict with that model directly. `clf` is deliberately NOT reassigned here:
    # rebinding it to an unseeded forest would change the base estimator used by
    # the grid search in the following folds and make the results non-reproducible.
    y_pred = clf_grid.predict(X_test)
    # Performance on the held-out outer fold
    CVErrors.append(accuracy_score(y_test, y_pred))

In [None]:
# Per-fold scores from the nested cross-validation (these are accuracies, despite the name)
CVErrors

In [None]:
# Average of the per-fold accuracies collected in CVErrors
np.mean(CVErrors)

In [None]:
# Recreate the single stratified 80/20 split (random_state=1) after the CV loop.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Grid search of parameters, this time selecting by ROC AUC instead of accuracy.
# (A bare `GridSearchCV()` call used to precede this; it raises TypeError because
# `estimator` and `param_grid` are required, so it has been removed.)
clf_grid = GridSearchCV(estimator=clf, param_grid=params,
                        cv=5, verbose=2, scoring='roc_auc', n_jobs=-1)
# Fit the model
clf_grid.fit(X_train, y_train)
# print results
print(clf_grid.best_params_)

In [None]:
# Forest with 400 trees and 2 candidate features per split; show its confusion
# matrix on the test set.
clf = RandomForestClassifier(
    n_estimators=400,
    max_features=2,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Fit the 400-tree / 2-feature forest and keep the predicted class probabilities
# for the test set (one column per class).
clf = RandomForestClassifier(
    n_estimators=400,
    max_features=2,
    oob_score=True,
    random_state=0,
)
clf.fit(X_train, y_train)
test_probabilities = clf.predict_proba(X_test)
y_pred_proba = np.array(test_probabilities)
y_pred_proba

In [None]:
# ROC curve and AUC computed from the second column of the predicted probabilities.
second_class_scores = y_pred_proba[:, 1]
fpr, tpr, _ = roc_curve(y_test, second_class_scores)
auc = roc_auc_score(y_test, second_class_scores)
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
# Impurity-based feature importances of the fitted forest.
feats = {}  # will map feature_name -> feature_importance
importances = clf.feature_importances_
importances

In [None]:
# Pair every feature name with its importance, sort in descending order and
# draw the result as a horizontal bar chart.
feats.update(zip(X_col, clf.feature_importances_))
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
    .sort_values(by='Gini-importance', ascending=False)
)
importances.plot.barh(color='r')

In [None]:
# Logistic regression: grid-search the C parameter and the penalty type with
# 5-fold cross-validation, scored by ROC AUC.
clf = LogisticRegression(solver='liblinear')
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
penalties = ['l1', 'l2']
params = dict(C=C_param_range, penalty=penalties)

clf_grid = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
clf_grid.fit(X_train, y_train)
# Show the best parameter combination found.
print(clf_grid.best_params_)

In [None]:
# L2-penalised logistic regression with C=100, scored on the test set.
clf = LogisticRegression(penalty='l2', C=100, solver='liblinear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', test_accuracy)

In [None]:
# Read the Hitters data and discard every row that has a missing value.
df = pd.read_csv('Hitters_Data.csv').dropna()
df

In [None]:
categorical_cols = ['League', 'Division', 'NewLeague']

# Response: the logarithm of Salary.
y = np.log(df.Salary)

# Dummy-encode the three categorical columns.
dummies = pd.get_dummies(df[categorical_cols])

# Numeric predictors: drop the dependent variable (Salary) and the columns for
# which dummy variables were created.
X_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')

# Feature set X: numeric predictors plus one dummy column per categorical variable.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

In [None]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Standardise the features. The scaler is fitted on the training set only and
# that same fitted scaler is then applied to the test set. Refitting it on the
# test set would put the two sets on different scales and use test-set
# statistics during evaluation.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Display the test-set feature matrix
X_test

In [None]:
# Random forest regression: choose the number of trees and the number of
# features per split by 5-fold cross-validated grid search, scored by negative
# mean squared error. (A stray, unused `RandomForestRegressor()` statement was
# removed from this cell.)
regressor = RandomForestRegressor(random_state=0)
# number of trees in random forest
n_estimators = [100, 200, 300, 400, 500, 600]
# number of features at every split
max_features = [3, 4, 5, 6, 7]
# create grid
params = {
    'n_estimators': n_estimators,
    'max_features': max_features,
}
# Grid search of parameters
clf_grid = GridSearchCV(estimator=regressor, param_grid=params,
                        cv=5, verbose=2, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit the model
clf_grid.fit(X_train, y_train)
# print results
print(clf_grid.best_params_)

In [None]:
# Random forest regressor with 300 trees and 4 candidate features per split.
clf = RandomForestRegressor(n_estimators=300, max_features=4, random_state=0)
# Train on the training set, then predict the test set.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', test_mse)

In [None]:
from sklearn.pipeline import Pipeline

# One-step pipeline; the 'Regressor' step is replaced by each candidate model
# listed in the search space below.
pipe = Pipeline([('Regressor', RandomForestRegressor())])

# Candidate learning algorithms together with their hyperparameter values.
alpha_grid = np.logspace(-3, 1, 10)
ridge_space = {
    'Regressor': [Ridge()],
    'Regressor__alpha': alpha_grid,
}
lasso_space = {
    'Regressor': [Lasso(max_iter = 10000)],
    'Regressor__alpha': alpha_grid,
}
knn_space = {
    'Regressor': [KNeighborsRegressor()],
    'Regressor__n_neighbors': [2, 3, 4, 5, 6],
}
forest_space = {
    'Regressor': [RandomForestRegressor(random_state=0)],
    'Regressor__n_estimators': [100, 200, 300, 400, 500],
    'Regressor__max_features': [3, 4, 5, 6, 7],
}
search_space = [ridge_space, lasso_space, knn_space, forest_space]

# 5-fold grid search across all candidates, then show the winning regressor.
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)
best_model = clf.fit(X_train, y_train)
best_model.best_estimator_.get_params()['Regressor']

In [None]:
# Test-set mean squared error of the model selected by the grid search.
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', test_mse)