In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Loads the feature table and the target table from their CSV files
features = pd.read_csv(filepath_or_buffer="data/features.csv")
target = pd.read_csv(filepath_or_buffer="data/target.csv")

In [3]:
# Discretizes the G3 column into four quantile-based bins
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(
    n_bins=4,
    encode="onehot-dense",
    strategy="quantile",
)

# fit_transform expects a 2-D input, hence the reshape to a single column
target_discretized = discretizer.fit_transform(
    target["G3"].values.reshape(-1, 1)
)

# Collapses each one-hot row to the position of its 1, i.e. the bin index
target_new = [list(one_hot).index(1) for one_hot in target_discretized]

In [4]:
# Tallies how often each raw G3 value occurs (list position = grade value, 21 slots)
# NOTE(review): `counts` is reassigned in the following cell before it is displayed
counts = [0 for _ in range(21)]
for grade in target["G3"].values:
    counts[grade] = counts[grade] + 1


In [5]:
# Number of samples that landed in each of the four bins (position = bin index)
counts = [target_new.count(bin_index) for bin_index in range(4)]
counts

[230, 153, 367, 294]

In [6]:
# Holds out 30% of the rows, then halves that hold-out into validation and test sets
X_train, X_test_val, y_train, y_test_val = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=1001,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=0.5,
    random_state=1001,
)

In [7]:
# Pulls the final-grade column (G3) out of each split to use as the regression target
t_train, t_val, t_test = (split["G3"] for split in (y_train, y_val, y_test))

In [38]:
# Ridge regression: picks alpha on the validation split, then reports R^2
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

param_grid = [0.001, 0.01, 0.1, 1, 10, 100]

# One fitted model per candidate alpha, trained on the training split
models = [Ridge(alpha=alpha).fit(X=X_train, y=t_train) for alpha in param_grid]

# Validation R^2 of every candidate, in the same order as param_grid
scores = [r2_score(t_val, candidate.predict(X_val)) for candidate in models]

# First position with the highest validation R^2 (ties go to the smaller alpha)
best_index = scores.index(max(scores))
best_score = scores[best_index]
best_param = param_grid[best_index]
best_model = models[best_index]

# R^2 of the selected model on each split (kept for the summary table)
ridge_train_score = r2_score(t_train, best_model.predict(X_train))
ridge_val_score = r2_score(t_val, best_model.predict(X_val))
ridge_test_score = r2_score(t_test, best_model.predict(X_test))

# Reports the selected alpha and the fit quality
print("Best Parameter:", best_param)
print("R-squared value for training set: ", ridge_train_score)
print("R-squared value for testing set: ", ridge_test_score)

Best Parameter: 100
R-squared value for training set:  0.31458176149234485
R-squared value for testing set:  0.2651854418480488


In [39]:
# Lasso regression: picks alpha on the validation split, then reports R^2
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

param_grid = [0.001, 0.01, 0.1, 1, 10, 100]

# One fitted model per candidate alpha, trained on the training split
models = [Lasso(alpha=alpha).fit(X=X_train, y=t_train) for alpha in param_grid]

# Validation R^2 of every candidate, in the same order as param_grid
scores = [r2_score(t_val, candidate.predict(X_val)) for candidate in models]

# First position with the highest validation R^2 (ties go to the smaller alpha)
best_index = scores.index(max(scores))
best_score = scores[best_index]
best_param = param_grid[best_index]
best_model = models[best_index]

# R^2 of the selected model on each split (kept for the summary table)
lasso_train_score = r2_score(t_train, best_model.predict(X_train))
lasso_val_score = r2_score(t_val, best_model.predict(X_val))
lasso_test_score = r2_score(t_test, best_model.predict(X_test))

# Reports the selected alpha and the fit quality
print("Best Parameter:", best_param)
print("R-squared value for training set: ", lasso_train_score)
print("R-squared value for testing set: ", lasso_test_score)

Best Parameter: 0.01
R-squared value for training set:  0.3446902144741497
R-squared value for testing set:  0.26472075923764216


In [52]:
# Gathers the train/validation/test R^2 of both regressors into one table
summary = pd.DataFrame(
    data={
        "Ridge": [ridge_train_score, ridge_val_score, ridge_test_score],
        "Lasso": [lasso_train_score, lasso_val_score, lasso_test_score],
    },
    index=["R^2 on training", "R^2 on validation", "R^2 on testing"],
)
# Transposed so each model is a row and each split a column, then written as HTML
summary.T.to_html("regression.html")

In [31]:
# Re-splits with the binned labels as target: 70% training, then the remaining
# 30% is halved into validation and test sets
X_train, X_test_val, y_train, y_test_val = train_test_split(
    features,
    target_new,
    test_size=0.3,
    random_state=1001,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=0.5,
    random_state=1001,
)


In [47]:
# Kernel SVM (SVC with its default kernel): picks gamma on the validation split,
# then reports classification accuracy
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score

param_grid = [0.001, 0.01, 0.1, 1, 10, 100]

# One fitted classifier per candidate gamma, trained on the training split
models = [SVC(gamma=gamma).fit(X=X_train, y=y_train) for gamma in param_grid]

# Validation accuracy of every candidate, in the same order as param_grid
scores = [candidate.score(X_val, y_val) for candidate in models]

# First position with the highest validation accuracy (ties go to the smaller gamma)
best_index = scores.index(max(scores))
best_score = scores[best_index]
best_param = param_grid[best_index]
best_model = models[best_index]

# Accuracy of the selected model on each split (kept for the summary table)
kernel_accuracy_train = best_model.score(X_train, y_train)
kernel_accuracy_val = best_score
kernel_accuracy_test = best_model.score(X_test, y_test)
kernel_scores = [kernel_accuracy_train, kernel_accuracy_val, kernel_accuracy_test]

print("Best Parameter:", best_param)
print("Prediction accuracy on the train data:", f"{kernel_accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{kernel_accuracy_test:.2%}")

Best Parameter: 0.001
Prediction accuracy on the train data: 48.22%
Prediction accuracy on the test data: 36.94%


In [48]:
# Linear SVM baseline: a single fit with default hyperparameters (no search)
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error, r2_score

# A fixed random_state makes the fit repeatable across kernel restarts;
# 1001 matches the seed used for the train/validation/test split.
model = LinearSVC(random_state=1001).fit(X_train, y_train)

# Accuracy on each split. The validation split is not used for tuning here,
# so its accuracy is reported as a real number rather than a placeholder,
# which also keeps the summary column numeric.
accuracy_train = model.score(X_train, y_train)
accuracy_val = model.score(X_val, y_val)
accuracy_test = model.score(X_test, y_test)
linear_scores = [accuracy_train, accuracy_val, accuracy_test]
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 41.10%
Prediction accuracy on the test data: 35.67%


In [49]:
# Decision tree: picks max_depth on the validation split, then reports accuracy
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVC

param_grid = [1, 5, 10, 25, 50]

# One fitted tree per candidate depth. A fixed random_state makes the fitted
# trees (and therefore the selected depth and the reported accuracies)
# repeatable across runs; 1001 matches the seed used for the data split.
models = [
    DecisionTreeClassifier(max_depth=depth, random_state=1001).fit(X=X_train, y=y_train)
    for depth in param_grid
]

# Validation accuracy of every candidate, in the same order as param_grid
scores = [candidate.score(X_val, y_val) for candidate in models]

# First position with the highest validation accuracy (ties go to the shallower tree)
best_index = scores.index(max(scores))
best_score = scores[best_index]
best_param = param_grid[best_index]
best_model = models[best_index]

# NOTE: the remarks below are about choosing a kernel for an SVM, not about this
# tree; they are retained from the original notebook for reference.
# - Kernels are recommended because they can model nonlinear structure
#   (feature expansion, i.e. considering several features at the same time).
# - To choose a kernel: try several, and inspect the data for products of two or
#   more features that look informative.

# Accuracy of the selected tree on each split (kept for the summary table)
print("Best Parameter:", best_param)
dt_accuracy_train = best_model.score(X_train, y_train)
dt_accuracy_val = best_score
dt_accuracy_test = best_model.score(X_test, y_test)
dt_scores = [dt_accuracy_train, dt_accuracy_val, dt_accuracy_test]
print("Prediction accuracy on the train data:", f"{dt_accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{dt_accuracy_test:.2%}")

Best Parameter: 5
Prediction accuracy on the train data: 56.58%
Prediction accuracy on the test data: 35.03%


In [51]:
# Gathers the train/validation/test accuracy of the three classifiers into one table
summary = pd.DataFrame(
    data={
        "RBF Kernel SVM": kernel_scores,
        "Linear SVM": linear_scores,
        # Label spelling corrected (was "Descision Tree") since it is written to the report
        "Decision Tree": dt_scores,
    },
    index=["Accuracy on training", "Accuracy on validation", "Accuracy on testing"],
)
# Transposed so each model is a row and each split a column, then written as HTML
summary.T.to_html("classification.html")