In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
def get_TFCE(dataframe_mean, dataframe_clean, column_mean = "mean", column_corr="ALEM", column_IOEM="IOEM"):
    """
    Compute the TFCE index for every respondent.

    The per-topic means are joined onto the responses via the 'Topic' column and the
    distance ``|column_corr - column_mean|`` is computed for every row. TFCE is the
    correlation (pandas default, Pearson), taken within each 'ResponseId', between
    ``column_IOEM`` and that distance.

    :param dataframe_mean: dataframe holding a 'Topic' column and the mean values in ``column_mean``
    :param dataframe_clean: dataframe holding 'ResponseId', 'Topic', ``column_corr`` and ``column_IOEM``
    :param column_mean: name of the column with the mean values as a string
    :param column_corr: name of the column with which you calculate the distance as a string
    :param column_IOEM: name of the column with which you calculate final correlation as a string
    :return: tfce dataframe with index = ResponseId and a single column 'TFCE' with correlation values
    """
    # Join main dataframe with mean dataframe (returns a new frame, inputs stay untouched)
    merged = pd.merge(dataframe_clean, dataframe_mean, on='Topic', how='outer')

    # Build a private frame with fixed helper names instead of inserting a column
    # called 'x' at a hard-coded position: this cannot collide with whatever
    # columns the caller's data happens to contain.
    pairs = pd.DataFrame({
        'ResponseId': merged['ResponseId'],
        'score': merged[column_IOEM],
        'distance': (merged[column_corr] - merged[column_mean]).abs(),
    })

    # One 2x2 correlation matrix per respondent, stacked under a (ResponseId, variable) index
    corr = pairs.groupby('ResponseId')[['score', 'distance']].corr()

    # Pick the score-vs-distance cell of each matrix by label rather than by row parity
    is_score_row = corr.index.get_level_values(-1) == 'score'
    tfce = corr.loc[is_score_row, 'distance'].droplevel(-1)

    return tfce.rename('TFCE').to_frame()

In [None]:
def Lasso_regression(X, y, test_size=0.1, random_state=None):
    """
    Tune and evaluate a Lasso regression on standardized data.

    The rows are split into a training and a test part, X and y are standardized
    with statistics learned on the training part only, a grid search picks the
    Lasso hyperparameters on the training part, and the best model (refitted by the
    search on the whole training part) is scored on the held-out test part.
    The best CV score, the best hyperparameters, R-squared, RMSE and the
    coefficients are printed as well as returned.

    :param X: The predictors in a pandas dataframe (its column names label the coefficients)
    :param y: The value to be predicted in a pandas dataframe (single column); a Series or 1-D array is accepted too
    :param test_size: share of the rows held out for testing, forwarded to train_test_split
    :param random_state: seed forwarded to train_test_split and to Lasso (relevant for selection='random');
                         the default None leaves the run unseeded
    :return: R-squared, RMSE, final_coeffs (RMSE is expressed in standardized-y units,
             final_coeffs is a pandas Series indexed by X.columns)
    """
    # StandardScaler only accepts 2-D input, so lift a 1-D target to a single column
    if np.ndim(y) == 1:
        y = np.asarray(y).reshape(-1, 1)

    # Split BEFORE scaling, otherwise the test rows leak into the scaling statistics
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Independent scalers for predictors and target, both fitted on the training part only
    x_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(Y_train)
    X_train = x_scaler.transform(X_train)
    X_test = x_scaler.transform(X_test)
    Y_train = y_scaler.transform(Y_train)
    Y_test = y_scaler.transform(Y_test)
    # NOTE: inside the cross-validation folds the scaling still comes from the whole
    # training part; wrapping scaler and model in a Pipeline would remove that too.

    param = {
        'alpha': [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1],
        'fit_intercept':[True,False],
        'positive':[True,False],
        'selection':['cyclic','random'],
        }

    # define model (seeded so that selection='random' is repeatable when a seed is given)
    model = Lasso(random_state=random_state)

    # define search
    search = GridSearchCV(model, param, scoring='neg_mean_absolute_error', n_jobs=-1)

    # execute search
    result = search.fit(X_train, Y_train)

    # summarize result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters: %s' % result.best_params_)

    # Best model: GridSearchCV (refit=True by default) has already refitted the winning
    # configuration on the full training part, so there is no need to fit it again
    lasso_best = result.best_estimator_
    y_pred = lasso_best.predict(X_test)
    r2 = r2_score(Y_test, y_pred)
    print(r2)
    root_mean_squared_error = np.sqrt(mean_squared_error(Y_test,y_pred))
    print(root_mean_squared_error)
    final_coeffs = pd.Series(lasso_best.coef_, index = X.columns)
    print(final_coeffs)

    return r2, root_mean_squared_error, final_coeffs