In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [30]:
from sklearn.metrics import r2_score
def tuner(values, x_train, y_train, x_val, y_val):
    """Pick the Ridge alpha that scores the highest R-squared on the validation data.

    One Ridge model is fitted on the training data for every candidate alpha and
    scored with r2_score on the validation data.

    Inputs:
            values: an iterable with the alpha values to tune; it is walked exactly
                    once, so lists, tuples, arrays and generators all work
            x_train: training feature array
            y_train: training target
            x_val: validation feature array
            y_val: validation target
    Output: the element of values whose model scored best; when several
            candidates tie, the earliest one is kept
    Raises: ValueError if values yields no candidates
    """
    max_val = None
    max_r2 = None  # None means "no candidate scored yet"

    # Finds best alpha
    for value in values:
        model = Ridge(alpha = value).fit(x_train, y_train)
        # Scores once per candidate and reuses the result below
        score = r2_score(y_val, model.predict(x_val))
        # Strict comparison keeps the first candidate on ties
        if max_r2 is None or score > max_r2:
            max_r2 = score
            max_val = value

    if max_r2 is None:
        raise ValueError("values must contain at least one alpha to try")

    return max_val


In [3]:
# Loads the feature table and the target table from their CSV files
features, target = (
    pd.read_csv(csv_path)
    for csv_path in ("data/features.csv", "data/target.csv")
)

In [4]:
# Bins target values
from sklearn.preprocessing import KBinsDiscretizer

# One-hot encodes the G3 column into quantile bins (n_bins=4), one column per bin
discretizer = KBinsDiscretizer(n_bins=4, encode = "onehot-dense", strategy = "quantile")

target_discretized = discretizer.fit_transform(target["G3"].values.reshape(-1, 1))

# Each one-hot row holds a single 1, so the position of the row maximum is the
# bin index; tolist() keeps target_new a plain list of Python ints
target_new = target_discretized.argmax(axis=1).tolist()

In [34]:
# Holds out a fraction of 0.2 of the rows; the rest becomes the training set
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1001,
)

# Halves the held-out rows into a testing set and a validation set
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=1001,
)

In [35]:
# Pulls the final-grade column (G3) out of each target split
t_train, t_test, t_val = (
    target_split["G3"] for target_split in (y_train, y_test, y_val)
)

In [36]:
# Imports Ridge and accuracy metrics
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Does hyper-parameter tuning on the G3 series (t_train / t_val), i.e. the same
# target the final model below is fitted and evaluated on
alpha = tuner([1, 10, 100, 150, 200], X_train, t_train, X_val, t_val)

# Performs ridge regression with the selected alpha
model = Ridge(alpha = alpha).fit(X_train, t_train)

# Measures goodness of fit (R-squared) on the training and testing sets
print("R-squared value for training set: ", r2_score(t_train, model.predict(X_train)))
print("R-squared value for testing set: ", r2_score(t_test, model.predict(X_test)))

R-squared value for training set:  0.33995559805826414
R-squared value for testing set:  0.2653063853580533


In [40]:
# Splits features and binned targets into training and testing sets for
# classification; note this rebinds X_train, X_test, y_train and y_test
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_new,
    random_state=1001,
)

In [43]:
# Imports SVC and accuracy metrics
from sklearn.svm import SVC
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Trains a support-vector classifier on the training split
model = SVC(gamma="scale")
model.fit(X_train, y_train)

# Scores the classifier on both splits and reports the results as percentages
accuracy_train, accuracy_test = (
    model.score(split_X, split_y)
    for split_X, split_y in ((X_train, y_train), (X_test, y_test))
)
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 37.42%
Prediction accuracy on the test data: 33.72%


In [44]:
# Imports DecisionTreeClassifier and accuracy metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Performs decision-tree classification; random_state is pinned because the
# tree permutes features at each split, so unseeded runs can give different trees
model = DecisionTreeClassifier(random_state = 1001).fit(X=X_train, y=y_train)

# Measures accuracy on the training and testing sets
accuracy_train = model.score(X_train, y_train)
accuracy_test = model.score(X_test, y_test)
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 100.00%
Prediction accuracy on the test data: 35.63%
