Datasets to upload before running: headbrain1.csv, 7282_1.csv

# **Train-Test Split**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the head-size / brain-weight dataset; headbrain1.csv must already be
# present in the working directory.
df = pd.read_csv('headbrain1.csv')
# Preview the first rows as this cell's output.
df.head()

In [None]:
# Preview the raw frame and report how many rows it holds.
print(df.head())
print("Main Data Size: ", len(df))

# Feature (head size) and target (brain weight) columns.
X = df['Head Size(cm^3)']
y = df['Brain Weight(grams)']

# Hold out a shuffled 25% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y,
                               test_size=0.25,
                               shuffle=True,
                               random_state=104)
X_train, X_test, y_train, y_test = split_parts

# Report every partition: a blank separator line, its label, its first rows
# and its element count.
for part_name, part in zip(('X_train', 'X_test', 'y_train', 'y_test'), split_parts):
    print('')
    print(part_name + ' : ')
    print(part.head())
    print(part.size)

# **Evaluation Method**

## **Simple Evaluation Metrics**

In [81]:
import numpy as np

# True and predicted binary labels for 15 samples.
real = [0,0,0,1,1,1,1,0,0,1,0,1,1,0,1]
pred = [1,0,0,0,1,1,0,1,0,1,1,0,1,0,1]
# One row per sample, pairing each true label with its prediction.
eval_test = pd.DataFrame(list(zip(real, pred)), columns=['real', 'pred'])

In [82]:
def status_teller(input):
  """Classify one prediction as a confusion-matrix outcome.

  Args:
    input: Mapping-like row (e.g. a DataFrame row) with a 'real' entry for
      the true label and a 'pred' entry for the predicted label.

  Returns:
    'TP' when real and pred are both 1, 'FP' when real is 0 and pred is 1,
    'FN' when real is 1 and pred is 0, and 'TN' for every other combination.
  """
  if input['real'] == 1:
    # Actual positive: the prediction decides between a hit and a miss.
    if input['pred'] == 1:
      return 'TP'
    if input['pred'] == 0:
      return 'FN'
    return 'TN'
  if input['real'] == 0 and input['pred'] == 1:
    return 'FP'
  # Everything else, including real == 0 with pred == 0, is a true negative.
  return 'TN'

# Label every row with its confusion-matrix outcome by applying
# status_teller row-wise (axis=1) and storing the result in a new column.
eval_test['status'] = eval_test.apply(status_teller, axis = 1)

In [83]:
# Show the labelled frame (real, pred, status) as the cell output.
eval_test

Unnamed: 0,real,pred,status
0,0,1,FP
1,0,0,TN
2,0,0,TN
3,1,0,FN
4,1,1,TP
5,1,1,TP
6,1,0,FN
7,0,1,FP
8,0,0,TN
9,1,1,TP


In [84]:
# Count how many samples fall into each true class.
aggregate = eval_test[['real', 'pred']].groupby(['real'])
aggregate_status = aggregate['pred'].count()

# reset_index has to be called: without the parentheses `agg` was bound to
# the method object itself instead of a table. `name='count'` labels the
# value column for what it holds (a row count per true class) rather than
# leaving it called 'pred'.
agg = aggregate_status.reset_index(name='count')

In [85]:
# Split the labelled frame by outcome and record the size of each group.
outcome = eval_test['status']

TP = eval_test.loc[outcome == 'TP']
TP_val = TP.shape[0]

FP = eval_test.loc[outcome == 'FP']
FP_val = FP.shape[0]

FN = eval_test.loc[outcome == 'FN']
FN_val = FN.shape[0]

TN = eval_test.loc[outcome == 'TN']
TN_val = TN.shape[0]

In [86]:
# Print the four confusion-matrix cell counts, one per line.
for tag, count in (('TP', TP_val), ('FP', FP_val), ('FN', FN_val), ('TN', TN_val)):
    print(tag + ' = ', count)

TP =  5
FP =  3
FN =  3
TN =  4


In [87]:
# Accuracy: share of all samples that were classified correctly.
correct = TP_val + TN_val
total = correct + FP_val + FN_val
Accuracy = correct / total
# Precision: share of predicted positives that are real positives.
Precision = TP_val / (TP_val + FP_val)
# Recall: share of real positives that were predicted positive.
Recall = TP_val / (TP_val + FN_val)

# Report each metric as a percentage.
for label, value in (("Accuracy  = ", Accuracy),
                     ("Precision = ", Precision),
                     ("Recall    = ", Recall)):
    print(label, value*100, "%")

Accuracy  =  60.0 %
Precision =  62.5 %
Recall    =  62.5 %


## **AUROC**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Load the iris dataset
iris = load_iris()

# Split the dataset into training and test halves (seeded, so repeatable)
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.5,
                                                    random_state=23)

# One-vs-rest wrapper around a random forest. The forest is seeded as well:
# the split above was already fixed, but an unseeded forest still made the
# AUC score and the curves change from run to run.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=23))

# fit model
clf.fit(X_train, y_train)

# Get predicted class probabilities for the test set (one column per class)
y_pred_prob = clf.predict_proba(X_test)

# Compute the one-vs-rest ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print('ROC AUC Score :',roc_auc)

# One ROC curve per class: class i is treated as positive, the others as negative
colors = ['orange','red','green']
for i, (class_name, color) in enumerate(zip(iris.target_names, colors)):
    fpr, tpr, thresh = roc_curve(y_test, y_pred_prob[:, i], pos_label=i)
    plt.plot(fpr, tpr, linestyle='--', color=color, label=class_name + ' vs Rest')
# Diagonal tpr = fpr as the reference line
plt.plot([0, 1], [0, 1], 'k--', label='Random classifier')
plt.title('Multiclass (Iris) ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend()
plt.show()

## **Root Mean Square Error (RMSE)**

In [None]:
import math
# Observed values and the corresponding predictions.
y_actual = [1,2,3,4,5]
y_predicted = [1.6,2.5,2.9,3,4.1]

# Mean of the squared element-wise differences.
residuals = np.asarray(y_actual) - np.asarray(y_predicted)
MSE = np.mean(residuals ** 2)

# RMSE is the square root of the MSE.
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)

# **Resampling Technique**

## **K-Fold Cross Validation**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
# Load the digits dataset.
digits = load_digits()
from sklearn.model_selection import train_test_split
# Hold out 30% of the digits for testing. The split is seeded: without a
# random_state every run produced a different partition, so scores computed
# on it could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    test_size=0.3, random_state=42)

In [None]:
# Baseline: a 40-tree random forest scored once on the hold-out split.
# The forest is seeded so that, for a given split, the model itself no longer
# adds run-to-run variation to the score.
rf = RandomForestClassifier(n_estimators=40, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validation of a 40-tree forest over the full digits data.
# The forest is seeded so its own randomness does not change the fold scores
# between runs.
score_rf = cross_val_score(RandomForestClassifier(n_estimators=40, random_state=42),
                           digits.data, digits.target, cv=3)
print(score_rf)
print('Avg :',np.average(score_rf))

In [None]:
# Compare forest sizes under 10-fold cross-validation.
# The four copy-pasted stanzas are folded into one loop, and every forest gets
# the same seed so that differences between the averages reflect n_estimators
# rather than random noise in the forests.
tree_counts = (5, 20, 30, 40)
cv_scores = {}
for n_trees in tree_counts:
    cv_scores[n_trees] = cross_val_score(
        RandomForestClassifier(n_estimators=n_trees, random_state=42),
        digits.data, digits.target, cv=10)
    print(f'Avg Score for Estimators={n_trees} and CV=10 :', np.average(cv_scores[n_trees]))

# Keep the original per-configuration names bound for any later use.
scores1, scores2, scores3, scores4 = (cv_scores[n] for n in tree_counts)

## **Bootstrapping Technique**

In [None]:
import pandas as pd
import numpy as np


# Movie reviews dataset; 7282_1.csv must already be present in the working
# directory. NOTE(review): this rebinds `df`, which previously held the
# head/brain data.
df = pd.read_csv('7282_1.csv')

# Draw a seeded sample of 500 rows; the bootstrap below resamples from it.
s=df.sample(500,random_state=7)

s.head()

In [None]:
# Histogram of the ratings in the 500-row sample.
s['reviews.rating'].hist()

In [None]:
# Summary statistics of the ratings in the 500-row sample.
s['reviews.rating'].describe()

In [None]:
# Bootstrap the mean rating: resample the sample with replacement 1000 times,
# each resample as large as the sample itself, and record each mean.
# A dedicated seeded RandomState drives the resampling; previously each call
# drew from unseeded randomness, so the distribution and the interval derived
# from it changed on every run.
bootstrap_rng = np.random.RandomState(7)
bootstrap = pd.DataFrame({'mean_rating': [
    s.sample(len(s), replace=True, random_state=bootstrap_rng)['reviews.rating'].mean()
    for _ in range(1000)
]})

bootstrap

In [None]:
# Histogram of the bootstrapped mean ratings.
bootstrap['mean_rating'].hist()

In [None]:
# 2.5th and 97.5th percentiles of the bootstrapped means, i.e. the bounds of
# a 95% percentile interval for the mean rating.
ci_low, ci_high = bootstrap['mean_rating'].quantile([0.025, 0.975])
(ci_low, ci_high)

In [None]:
# Mean rating over the full dataset, for comparison with the bootstrap interval.
df['reviews.rating'].mean()

## **Undersampling and Oversampling Technique**

In [1]:
# Creating the modeling dataset
from sklearn.datasets import make_classification
# Data processing
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Model and performance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter

In [2]:
# Build a seeded, heavily imbalanced binary dataset with two informative features.
# NOTE(review): the observed minority share (~1%) is about double the 0.5%
# weight requested below; presumably this is the default label noise (flip_y)
# of make_classification. Confirm.
X, y = make_classification(
    n_samples=100000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.995, 0.005],
    class_sep=0.5,
    random_state=0,
)
# Wrap the arrays in a frame: two feature columns plus the target.
df = pd.DataFrame(X, columns=['feature1', 'feature2']).assign(target=y)
# Relative frequency of each class.
df['target'].value_counts(normalize = True)

0    0.9897
1    0.0103
Name: target, dtype: float64

In [None]:
# Scatter the two features on an explicit 12x8 axes, coloured by class.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='feature1', y='feature2', hue='target', ax=ax)

In [None]:
# Hold out 20% of the data for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
# Report the size of each partition.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print('The number of records in the training dataset is', n_train)
print('The number of records in the test dataset is', n_test)
# Class counts ordered by label: the first entry is reported as the majority
# class, the second as the minority class.
train_class_counts = sorted(Counter(y_train).items())
print(f"The training dataset has {train_class_counts[0][1]} records for the majority class and {train_class_counts[1][1]} records for the minority class.")

In [None]:
# Train the baseline random forest on the original (imbalanced) training data.
# The forest is seeded so its own randomness does not change the report
# between runs.
rf = RandomForestClassifier(random_state=42)
baseline_model = rf.fit(X_train, y_train)
baseline_prediction = baseline_model.predict(X_test)
# Check the model performance on the test set
print(classification_report(y_test, baseline_prediction))

### Undersampling

Random Undersampling

In [None]:
# Random undersampling of the training data (seeded).
rus = RandomUnderSampler(random_state=42)
resampled_rus = rus.fit_resample(X_train, y_train)
X_train_rus, y_train_rus = resampled_rus
# Per-class record counts after undersampling, ordered by class label.
rus_class_counts = Counter(y_train_rus)
print(sorted(rus_class_counts.items()))

In [None]:
# Put the undersampled training data into a frame so the points can be coloured by class.
df_rus = pd.DataFrame(X_train_rus, columns=['feature1', 'feature2']).assign(target=y_train_rus)
# Scatter the two features on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_rus, x='feature1', y='feature2', hue='target', ax=ax)
ax.set_title('Random Under Sampling')

In [None]:
# Train a fresh random forest on the randomly undersampled training data.
# A new estimator is created instead of re-fitting the shared `rf`: fit()
# returns the estimator itself, so re-fitting `rf` made rus_model the very
# same object as every model previously obtained from it and silently
# overwrote what those names referred to. The seed keeps the forest's own
# randomness from changing the report between runs.
rus_model = RandomForestClassifier(random_state=42).fit(X_train_rus, y_train_rus)
rus_prediction = rus_model.predict(X_test)
# Check the model performance on the original test set
print(classification_report(y_test, rus_prediction))

Near-Miss Undersampling

In [None]:
# NearMiss (version 3) undersampling of the training data.
nearmiss = NearMiss(version=3)
resampled_nearmiss = nearmiss.fit_resample(X_train, y_train)
X_train_nearmiss, y_train_nearmiss = resampled_nearmiss
# Per-class record counts after undersampling, ordered by class label.
nearmiss_class_counts = Counter(y_train_nearmiss)
print(sorted(nearmiss_class_counts.items()))

In [None]:
# Put the NearMiss-resampled training data into a frame so the points can be coloured by class.
df_nearmiss = pd.DataFrame(X_train_nearmiss, columns=['feature1', 'feature2']).assign(target=y_train_nearmiss)
# Scatter the two features on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_nearmiss, x='feature1', y='feature2', hue='target', ax=ax)
ax.set_title('NearMiss Under Sampling')

In [None]:
# Train a fresh random forest on the NearMiss-resampled training data.
# A new estimator is created instead of re-fitting the shared `rf`: fit()
# returns the estimator itself, so re-fitting `rf` made nearmiss_model the
# very same object as every model previously obtained from it and silently
# overwrote what those names referred to. The seed keeps the forest's own
# randomness from changing the report between runs.
nearmiss_model = RandomForestClassifier(random_state=42).fit(X_train_nearmiss, y_train_nearmiss)
nearmiss_prediction = nearmiss_model.predict(X_test)
# Check the model performance on the original test set
print(classification_report(y_test, nearmiss_prediction))

### Oversampling

Random Oversampling

In [None]:
# Random oversampling of the training data (seeded).
ros = RandomOverSampler(random_state=42)
resampled_ros = ros.fit_resample(X_train, y_train)
X_train_ros, y_train_ros = resampled_ros
# Per-class record counts after oversampling, ordered by class label.
ros_class_counts = Counter(y_train_ros)
print(sorted(ros_class_counts.items()))

In [None]:
# Put the oversampled training data into a frame so the points can be coloured by class.
df_ros = pd.DataFrame(X_train_ros, columns=['feature1', 'feature2']).assign(target=y_train_ros)
# Scatter the two features on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_ros, x='feature1', y='feature2', hue='target', ax=ax)
ax.set_title('Random Over Sampling')

In [None]:
# Train a fresh random forest on the randomly oversampled training data.
# A new estimator is created instead of re-fitting the shared `rf`: fit()
# returns the estimator itself, so re-fitting `rf` made ros_model the very
# same object as every model previously obtained from it and silently
# overwrote what those names referred to. The seed keeps the forest's own
# randomness from changing the report between runs.
ros_model = RandomForestClassifier(random_state=42).fit(X_train_ros, y_train_ros)
ros_prediction = ros_model.predict(X_test)
# Check the model performance on the original test set
print(classification_report(y_test, ros_prediction))

SMOTE

In [None]:
# Oversample the training data with SMOTE (Synthetic Minority Over-sampling
# Technique), seeded for repeatability.
smote = SMOTE(random_state=42)
resampled_smote = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled_smote
# Per-class record counts after oversampling, ordered by class label.
smote_class_counts = Counter(y_train_smote)
print(sorted(smote_class_counts.items()))

In [None]:
# Put the SMOTE-resampled training data into a frame so the points can be coloured by class.
df_smote = pd.DataFrame(X_train_smote, columns=['feature1', 'feature2']).assign(target=y_train_smote)
# Scatter the two features on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_smote, x='feature1', y='feature2', hue='target', ax=ax)
ax.set_title('SMOTE Over Sampling')

In [None]:
# Train a fresh random forest on the SMOTE-resampled training data.
# A new estimator is created instead of re-fitting the shared `rf`: fit()
# returns the estimator itself, so re-fitting `rf` made smote_model the very
# same object as every model previously obtained from it and silently
# overwrote what those names referred to. The seed keeps the forest's own
# randomness from changing the report between runs.
smote_model = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)
smote_prediction = smote_model.predict(X_test)
# Check the model performance on the original test set
print(classification_report(y_test, smote_prediction))