In [18]:
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression,Ridge,PoissonRegressor
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
from collections import defaultdict
import os
import re

In [5]:
foundFile = False
while not foundFile:
    path = input('Enter your filepath to the data (.csv only, enter "exit" to end): ')
    if path == 'exit':
        break
    elif path[-4:] != '.csv':
        print("That's not a csv file!")
        continue
    try:
        df = pd.read_csv(path)
        foundFile = True
    except FileNotFoundError:
        print('File not found. Enter your filepath again.')
print('============ Column Names ============')
print(df.columns)
print('============ DataFrame ============')
print(df)

Index(['student_id', 'age', 'gender', 'course', 'study_hours',
       'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality',
       'study_method', 'facility_rating', 'exam_difficulty', 'exam_score'],
      dtype='object')
       student_id  age  gender  ... facility_rating  exam_difficulty  exam_score
0               1   17    male  ...             low             hard        58.9
1               2   23   other  ...          medium         moderate        54.8
2               3   22    male  ...            high         moderate        90.3
3               4   20   other  ...             low         moderate        29.7
4               5   20  female  ...             low         moderate        43.7
...           ...  ...     ...  ...             ...              ...         ...
19995       19997   18   other  ...             low             easy        86.5
19996       19998   18    male  ...          medium         moderate        60.9
19997       19999   19   other  

In [7]:
cols = df.columns[df.dtypes.eq('object')]
for c in cols:
    df[c] = df[c].astype('category').cat.codes


d = input('Enter the column(s) to drop (Enter "#NONE" if none): ')
li = d.split(' ')
print(li)
if li != ['#NONE']:
    for c in li:
        try:
            df.drop(columns=[c])
        except KeyError:
            print('Drop column failed, the column was', c)

scaler = StandardScaler()
scaled = scaler.fit_transform(df)

print(scaled)

['student_id']
[[-1.73196262 -1.52044246 -0.00901182 ... -0.00966555 -0.21348541
  -0.19109485]
 [-1.73178942  1.10606684  1.2170862  ...  1.21382199  0.9290643
  -0.40793407]
 [-1.73161622  0.66831529 -0.00901182 ... -1.23315309  0.9290643
   1.46957626]
 ...
 [ 1.73178783 -0.64493936  1.2170862  ... -0.00966555 -1.35603512
   0.10507579]
 [ 1.73196103 -0.64493936 -0.00901182 ...  1.21382199  0.9290643
   0.87194621]
 [ 1.73213424 -0.20718781 -0.00901182 ...  1.21382199  0.9290643
   0.44884529]]


In [None]:
while True:
    response = input('Enter the response variable: ')
    try:
        x = df.drop(columns=[response])
        y = df[response]
        
        break
    except KeyError:
        print('That column does not exist, please try again.')
        continue

while True:
    methods = input('Choose the regression methods you want to add (1. Linear regression 2. Polynomial 3. Ridge 4. Lasso 5. Logistic 6. Poisson )')
    methods_list = methods.split(' ')
    have_methods = [False for _ in range(8)]
    have = False
    global valid_li
    valid_li = [m for m in methods_list if re.match(pattern=r'^[1-7]$',string=m)]
    for m in methods_list:
        try:
            if int(m) >= 1 and int(m) <= 7:
                have_methods[int(m)-1] = True
                have = True
        except ValueError:
            print('Error!!!! The input ' + m + ' is not valid.')
    if have:
        break

cpu_cores = os.cpu_count()

linear_parems = defaultdict(list)


# fit_intercept: have x and y intercept or not
# tol: The precision of the solution (coef_) is determined by tol which specifies a different convergence criterion for the lsqr solver.
# n_jobs: how much cpu cores will be used
# positive: force the coeffients to be positive or not
valid_li=[]
def do_linear():
    params_to_test = input('Enter the parameters you want to test. (1. fit_intercept 2. tol 3. n_jobs 4. positive')
    parems_li = params_to_test.split(' ')
    # filiter invalid inputs list like [fdjhdsafl, 1, 2, dfakjfk, 3, 100]
    
    for p in parems_li:
        # fit_intercept
        if p == '1':
            fit_inter = input('Enter the values you want to try on '+'fit_intercept'+' (T/F)')
            fit_list = set(fit_inter.split(' '))
            if "T" in fit_list:
                linear_parems['fit_intercept'].append(True)
            elif "F" in fit_list:
                linear_parems['fit_intercept'].append(False)
        # tol
        elif p == '2':
            tol = set(input('Enter the values you want to try on '+'tol'+' (float numbers)').split(' '))
            for t in tol:
                try:
                    if float(t) < 1:
                        linear_parems['tol'].append(float(t))
                    else:
                        print(t +'is too big!!!!!!!!!!!!!!')
                except ValueError:
                    print(t +'was an invalid input!!!!!!!!!!!!!!')
        # n_jobs
        elif p == '3':
            nj = set(input('Enter the values you want to try on '+'n_jobs'+' (integer numbers)').split(' '))
            for n in nj:
                try:
                    if int(n) < 0:
                        if abs(int(n)) < cpu_cores:
                            linear_parems['n_jobs'].append(int(n))
                    elif int(n) >0:
                        if abs(int(n)) <= cpu_cores:
                            linear_parems['n_jobs'].append(int(n))
                except ValueError:
                    print(n +'was an invalid input!!!!!!!!!!!!!!')
        # positive
        elif p == '4':
            pos = set(input('Enter the values you want to try on '+'positive'+' (T/F)').split(' '))
            if "T" in pos:
                linear_parems['positive'].append(True)
            elif "F" in pos:
                linear_parems['positive'].append(False)


poly_parem = defaultdict(list)

# interaction_only == True --> a1,a2 degree=2 --> won't generate a1^2 and a2^2 (force the inputs to interact)
# include_bias: add bias(constant) into our model or not
# order: we have x1,x2,x1^2,x2^2 -> 'C':x1,x1^2,x2,x2^2 | 'F':x1,x2,x1^2,x2^2

def do_poly():
    params_to_test = input('Enter the parameters you want to test. (1. degree 2. interaction_only 3. include_bias 4. order').split(' ')
    for p in params_to_test:
        # degree
        # 4,6 7
        if p == '1':
            deg = set(input('Enter the degree you want (seperate min max with commas)').split(' '))
            for d in deg:
                if re.search(r'^[0-9]+,[0-9]+$',d):
                    ds = d.split(',')
                    try:
                        if int(ds[0]) < int(ds[1]):
                            poly_parem['degree'].append((ds[0],ds[1]))
                        else:
                            print(d,' is not a valid input!!!!!!!!')
                    except ValueError:
                        print(d,' is not a valid input!!!!!!!!')
                elif re.search(r'^[0-9]+$',d):
                    try:
                        poly_parem['degree'].append(int(d))
                    except ValueError:
                        print(d,' is not a valid input!!!!!!!!')
        # interaction_only
        elif p == '2':
            deg = set(input('Enter the degree you want (T/F)').split(' '))
            for d in deg:
                if d == 'T':
                    poly_parem['interaction_only'].append(True)
                elif d == 'F':
                    poly_parem['interaction_only'].append(False)
                else:
                    print(d,' is not a valid input!!!!!!!!')
        # include_bias
        elif p == '3':
            deg = set(input('Do you want to add a bias column into the model (T/F)').split(' '))
            for d in deg:
                if d == 'T':
                    poly_parem['include_bias'].append(True)
                elif d == 'F':
                    poly_parem['include_bias'].append(False)
                else:
                    print(d,' is not a valid input!!!!!!!!')
        #order
        elif p == '4':
            orders = set(input('Enter the order you want to try (C/F)').split(' '))
            for o in orders:
                if o == 'C':
                    poly_parem['order'].append('C')
                elif o == 'F':
                    poly_parem['order'].append('F')

# ridge regression

ridge_parems = defaultdict(list)

# alpha: regularlization strength[0,np.inf)
# tol: precision

def do_ridge():
    pars = set(input('Enter the parems you want change').split(' '))
    valid_solvers = {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}
    global ridge_parems
    for p in pars:
        match p:
            case '1':
                alphas = set(input('Enter the alpha you want to try [0,np.inf)').split(' '))
                for a in alphas:
                    try:
                        if a == 'np.inf':
                            ridge_parems['alpha'].append(np.inf)
                        ridge_parems['alpha'].append(float(a))
                    except ValueError:
                        print(a,'was an invalid input!!!!!!!!!!!!!!!!!!!!')
            case '2':
                tols = set(input('Enter the values you want to try on '+'tol'+' (float numbers)').split(' '))
                for t in tols:
                    try:
                        if float(t) < 1:
                            ridge_parems['tol'].append(float(t))
                        else:
                            print(t +'is too big!!!!!!!!!!!!!!')
                    except ValueError:
                        print(t +'was an invalid input!!!!!!!!!!!!!!')
            case '3':
                solves = set(input('Enter the values you want to try on '+'solvers').split(' '))
                for s in solves:
                    if s in valid_solvers:
                        ridge_parems['tol'].append(s)
                    else:
                        print(s +'was an invalid input!!!!!!!!!!!!!!')
                

l_parems = {}

# precompute: true 比較快但少了sparse
# selection: iteration後調係數的方法

def do_lasso():
    pars = set(input('Enter the (alpha,tol,solver)').split(' '))
    valid_solvers = {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}
    global l_parems
    for p in pars:
        match p:
            case '1':
                alphas = set(input('Enter the alpha you want to try [0,np.inf)').split(' '))
                for a in alphas:
                    try:
                        if a == 'np.inf':
                            l_parems['alpha'].append(np.inf)
                        l_parems['alpha'].append(float(a))
                    except ValueError:
                        print(a,'was an invalid input!!!!!!!!!!!!!!!!!!!!')
            case '2':
                tols = set(input('Enter the values you want to try on '+'tol'+' (float numbers)').split(' '))
                for t in tols:
                    try:
                        if float(t) < 1:
                            l_parems['tol'].append(float(t))
                        else:
                            print(t +'is too big!!!!!!!!!!!!!!')
                    except ValueError:
                        print(t +'was an invalid input!!!!!!!!!!!!!!')
            case '3':
                pres = set(input('Enter the values you want to try on '+'precompute').split(' '))
                for p in pres:
                    if p =='T':
                        l_parems['precompute'].append(True)
                    elif p=='F':
                        l_parems['precompute'].append(False)
            case '4':
                sels = set(input('Enter the values you want to try on '+'selection').split(' '))
                for s in sels:
                    if s == 'cyclic':
                        l_parems['selection'].append('cyclic')
                    elif s == 'random':
                        l_parems['selection'].append('random')

poi_parems = {}

def do_poisson():
    pars = set(input('Enter the (alpha,tol,solver)').split(' '))
    valid_solvers = {'newton-cholesky', 'lbfgs'}
    global poi_parems
    for p in pars:
        match p:
            case'1':
                alphas = set(input('Enter the alpha you want to try [0,np.inf)').split(' '))
                for a in alphas:
                    try:
                        if a == 'np.inf':
                            poi_parems['alpha'].append(np.inf)
                        poi_parems['alpha'].append(float(a))
                    except ValueError:
                        print(a,'was an invalid input!!!!!!!!!!!!!!!!!!!!')
            case '2':
                solves = set(input('Enter the solver you want to try').split(' '))
                for s in solves:
                    if s in valid_solvers:
                        poi_parems['solver'].append(np.inf)
            case '3':
                tols = set(input('Enter the values you want to try on '+'tol'+' (float numbers)').split(' '))
                for t in tols:
                    try:
                        if int(t) > 0:
                            poi_parems['tol'].append(float(t))
                        else:
                            print(t +'is too small!!!!!!!!!!!!!!')
                    except ValueError:
                        print(t +'was an invalid input!!!!!!!!!!!!!!')

            
            
                
for i,v in enumerate(have_methods):
    if i == 0 and v:
        do_linear()
    elif i == 1 and v:
        do_poly()
    elif i == 2 and v:
        do_ridge()
    elif i==3 and v:
        do_poisson()

np.inf was an invalid input!!!!!!!!!!!!!!!!!!!!
dfahdafhsidhsfaihn was an invalid input!!!!!!!!!!!!!!!!!!!!
dfaijofdaodiafjowas an invalid input!!!!!!!!!!!!!!
dsajadfswas an invalid input!!!!!!!!!!!!!!
2is too big!!!!!!!!!!!!!!
defaultdict(<class 'list'>, {})


In [14]:
valid_li = ['1','2','3']

In [None]:
best_fits = {}



def grid_searching(reg_type,reg_parems,reg_name):
    global best_fits
    clf = GridSearchCV(reg_type,reg_parems)
    clf.fit(x,y)
    print(reg_name,'with the paremeters of ',clf.best_params_,' has a score of ',clf.best_score_)

for v in valid_li:
    match v:
        case '1':
            grid_searching(LinearRegression(),linear_parems,'Linear')
        case '2':
            grid_searching(make_pipeline(PolynomialFeatures(),LinearRegression()),poly_parem,'Polynomial')
        case '3':
            grid_searching(Ridge(),ridge_parems,'Ridge')
        case '4':
            grid_searching(PoissonRegressor(),poi_parems,'Poisson')


#   type  best_score  best_parem
# {'linear':[0.59,{'C':np.inf}]}


Linear with the paremeters of  {}  has a score of  0.6685802757167701
Polynomial with the paremeters of  {}  has a score of  0.7213130120887226
Ridge with the paremeters of  {'alpha': 0.00123, 'tol': 1e-08}  has a score of  0.6685802757831155


25 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/yang/Desktop/regression_comparison/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/yang/Desktop/regression_comparison/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1329, in wrapper
    estimator._validate_params()
  File "/home/yang/Desktop/regression_comparison/.venv/lib/python3.12/site-packages/sklearn/base.py", line 492, in _validate_params
    validate_parameter_constraints(
  File "/home/yang/Desktop/regression_comparison/.venv/lib

In [21]:
linear_parems

defaultdict(list, {})