In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Loads the two input tables from the local data/ folder:
# `features` holds the predictors, `target` holds the grade columns.
features, target = (
    pd.read_csv(csv_path)
    for csv_path in ("data/features.csv", "data/target.csv")
)

In [5]:
# Turns the numeric final grade (G3) into a 4-class label for classification.
from sklearn.preprocessing import KBinsDiscretizer

# Bin edges are placed at quantiles of G3; the output is a dense one-hot matrix
# with one column per bin.
discretizer = KBinsDiscretizer(n_bins=4, encode="onehot-dense", strategy="quantile")

# The discretizer expects a 2-D input, hence the reshape to a single column.
grades_column = target["G3"].values.reshape(-1, 1)
target_discretized = discretizer.fit_transform(grades_column)

# Each one-hot row contains exactly one 1; the position of that 1 is the bin
# index, which becomes the class label (plain Python ints in a list).
target_new = target_discretized.argmax(axis=1).tolist()

In [12]:
# Histogram of the final grade G3: counts[g] is the number of rows whose G3 == g.
grades = target["G3"]

# Always report at least slots 0..20 (the original fixed size), but grow the
# list when a higher grade is present instead of failing with IndexError.
highest_grade = int(grades.max()) if len(grades) else 0
n_slots = max(21, highest_grade + 1)

# value_counts() tallies in one vectorised pass; reindex() puts the tallies in
# grade order and fills grades that never occur with 0.
counts = grades.value_counts().reindex(range(n_slots), fill_value=0).tolist()
counts

[53,
 1,
 0,
 0,
 1,
 8,
 18,
 19,
 67,
 63,
 153,
 151,
 103,
 113,
 90,
 82,
 52,
 35,
 27,
 7,
 1]

In [7]:
# Class balance check: how many samples fell into each of the four bins
# (labels 0-3 in target_new).
counts = [target_new.count(bin_label) for bin_label in range(4)]
counts

[230, 153, 367, 294]

In [5]:
# Regression split: partitions the predictors and the raw grade table into
# training and testing sets. The fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=1001,
)

In [6]:
# Regression target: keep only the final-grade column (G3) from each partition
# of the grade table.
t_train, t_test = (partition["G3"] for partition in (y_train, y_test))

In [7]:
# Ridge regression on the final grade, scored with R-squared.
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# L2-penalised linear model with penalty strength alpha = 100, fitted on the
# training partition only.
model = Ridge(alpha=100)
model.fit(X=X_train, y=t_train)

# R-squared on both partitions; the gap between them shows how well the fit
# generalises to unseen rows.
r2_train = r2_score(t_train, model.predict(X_train))
r2_test = r2_score(t_test, model.predict(X_test))
print("R-squared value for training set: ", r2_train)
print("R-squared value for testing set: ", r2_test)

R-squared value for training set:  0.30899801584217534
R-squared value for testing set:  0.22910901629507463


In [17]:
# Lasso regression on the final grade, scored with R-squared.
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# L1-penalised linear model with penalty strength alpha = 0.01, fitted on the
# training partition only. Rebinds `model`, replacing any earlier estimator.
model = Lasso(alpha=.01)
model.fit(X=X_train, y=t_train)

# R-squared on both partitions; the gap between them shows how well the fit
# generalises to unseen rows.
r2_train = r2_score(t_train, model.predict(X_train))
r2_test = r2_score(t_test, model.predict(X_test))
print("R-squared value for training set: ", r2_train)
print("R-squared value for testing set: ", r2_test)

R-squared value for training set:  0.33565005368951184
R-squared value for testing set:  0.22893162371533482


In [9]:
# Classification split: same predictors and seed as before, but the labels are
# now the bin indices in target_new.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so y_train and y_test
# are label lists from here on rather than the grade table.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_new,
    random_state=1001,
)

In [10]:
# Support-vector classifier on the binned grade labels.
# NOTE: r2_score and mean_squared_error are imported here but not used in this cell.
from sklearn.svm import SVC
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Advice noted for this model: use a kernel.
# No `kernel` argument is passed, so the library default applies; only gamma is set.
model = SVC(gamma=0.01)
model.fit(X=X_train, y=y_train)

# score() returns the fraction of correctly predicted labels on each partition.
accuracy_train, accuracy_test = (
    model.score(X_part, y_part)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 60.15%
Prediction accuracy on the test data: 40.23%


In [11]:
# Decision-tree classifier on the binned grade labels.
# NOTE: r2_score and mean_squared_error are imported here but not used in this cell.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Performs Decision Tree Classifier.
# The tree builder uses randomness, so the estimator is seeded; without a seed
# the fitted tree and the test accuracy can change between runs. 1001 is the
# same seed the train/test splits use.
model = DecisionTreeClassifier(random_state = 1001).fit(X=X_train, y=y_train)

# Notes on kernels (these concern kernel-based models such as SVC, not the tree):
# - A kernel is recommended because we want to be able to model nonlinear
#   relationships; it acts as a feature expansion used to separate the classes.
# - Choosing the kernel: brute force / try several. Also inspect the data for
#   anything interesting, e.g. two or more features whose product looks
#   informative.
# - Intuition: a kernel considers n features at the same time.

# Measures accuracy: fraction of correctly predicted labels on each partition.
accuracy_train = model.score(X_train, y_train)
accuracy_test = model.score(X_test, y_test)
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 100.00%
Prediction accuracy on the test data: 38.70%
