### Index
1. Metrics calculation on test set
2. Metrics calculated using k-fold

# 1. Metrics calculation on test set
### Link: https://www.analyticsvidhya.com/blog/2021/05/know-the-best-evaluation-metrics-for-your-regression-model/  (Code taken)

https://medium.com/analytics-vidhya/evaluation-metrics-for-regression-algorithms-along-with-their-implementation-in-python-9ec502729dad (Same metric is there in this link also)

Below topics are covered in this section
1. Mean Absolute Error(MAE)
2. Mean Squared Error(MSE)
3. Root Mean Squared Error(RMSE)
4. Root Mean Squared Log Error(RMSLE)
5. R Squared (R2)
6. Adjusted R Squared

In [1]:
import pandas as pd

In [4]:
data_raw = pd.read_csv("../Data/movie_metadata.csv")

In [12]:
data = data_raw[['num_critic_for_reviews', 'duration', 'imdb_score']].head(10).fillna(data.mean())
data

Unnamed: 0,num_critic_for_reviews,duration,imdb_score
0,723.0,178.0,7.9
1,302.0,169.0,7.1
2,602.0,148.0,6.8
3,813.0,164.0,8.5
4,514.222222,149.0,7.1
5,462.0,132.0,6.6
6,392.0,156.0,6.2
7,324.0,100.0,7.8
8,635.0,141.0,7.5
9,375.0,153.0,7.5


In [13]:
X = data.drop('imdb_score', axis=1)
y = data['imdb_score']

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [18]:
X_train

Unnamed: 0,num_critic_for_reviews,duration
5,462.0,132.0
0,723.0,178.0
7,324.0,100.0
2,602.0,148.0
3,813.0,164.0
6,392.0,156.0
9,375.0,153.0
8,635.0,141.0


In [17]:
y_train

5    6.6
0    7.9
7    7.8
2    6.8
3    8.5
6    6.2
9    7.5
8    7.5
Name: imdb_score, dtype: float64

In [21]:
print(y_test)
print(y_pred)

4    7.1
1    7.1
Name: imdb_score, dtype: float64
[7.23429625 6.30866628]


### 1) Mean Absolute Error(MAE)

In [19]:
from sklearn.metrics import mean_absolute_error
print("MAE",mean_absolute_error(y_test,y_pred))

MAE 0.4628149868413338


### 2) Mean Squared Error(MSE)

In [20]:
from sklearn.metrics import mean_squared_error
print("MSE",mean_squared_error(y_test,y_pred))

MSE 0.3221222703393354


### 3) Root Mean Squared Error(RMSE)

In [23]:
import numpy as np
print("RMSE",np.sqrt(mean_squared_error(y_test,y_pred)))

RMSE 0.5675581647191197


### 4) Root Mean Squared Log Error(RMSLE)

In [24]:
print("RMSE",np.log(np.sqrt(mean_squared_error(y_test,y_pred))))

RMSE -0.5664120419814902


### 5) R Squared (R2)

In [25]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,y_pred)
print(r2)

0.0


### 6) Adjusted R Squared

In [30]:
n=2
k=2
adj_r2_score = 1 - ((1-r2)*(n-1)/(n-k-1))
print(adj_r2_score)

2.0


# 2. Metrics calculated using k-fold

### Link: https://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/

In [33]:
# 1. Mean Absolute Error
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'neg_mean_absolute_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("MAE: %.3f (%.3f)" % (results.mean(), results.std()))

MAE: -3.387 (0.667)


In [36]:
# 2. Mean Squared Error
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("MSE: %.3f (%.3f)" % (results.mean(), results.std()))

MSE: -23.747 (11.143)


In [37]:
# 3. R^2 Metric
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'r2'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("R^2: %.3f (%.3f)" % (results.mean(), results.std()))


R^2: 0.718 (0.099)


In [39]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])