In [18]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.linear_model import LassoLarsIC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

def calculateAic(y, y_pred, n, k):
    """Return the Akaike Information Criterion for a set of predictions.

    The log-likelihood is the Gaussian one evaluated from the residual
    sum of squares: ``-(n/2) * (1 + log(2*pi)) - (n/2) * log(RSS / n)``.

    Args:
        y: Observed targets. Must support subtraction with ``y_pred`` and
            expose ``.dot`` (e.g. a 1-D numpy array).
        y_pred: Predicted targets, same shape as ``y``.
        n: Number of observations used in the likelihood.
        k: Number of parameters used in the ``2 * k`` penalty.

    Returns:
        ``-2 * logL + 2 * k``.
    """
    residuals = y - y_pred
    rss = residuals.dot(residuals)

    half_n = n / 2
    constant_term = half_n * (1 + np.log(2 * np.pi))
    variance_term = half_n * np.log(rss / n)
    log_likelihood = -constant_term - variance_term

    return 2 * k - 2 * log_likelihood
    
def calculateBic(y, y_pred,n, k):
    """Return the Bayesian Information Criterion for a set of predictions.

    Uses the same Gaussian log-likelihood as ``calculateAic``:
    ``-(n/2) * (1 + log(2*pi)) - (n/2) * log(RSS / n)``.

    Args:
        y: Observed targets. Must support subtraction with ``y_pred`` and
            expose ``.dot`` (e.g. a 1-D numpy array).
        y_pred: Predicted targets, same shape as ``y``.
        n: Number of observations used in the likelihood and the penalty.
        k: Number of parameters used in the ``k * log(n)`` penalty.

    Returns:
        ``-2 * logL + k * log(n)``.
    """
    residuals = y - y_pred
    rss = residuals.dot(residuals)

    half_n = n / 2
    constant_term = half_n * (1 + np.log(2 * np.pi))
    variance_term = half_n * np.log(rss / n)
    log_likelihood = -constant_term - variance_term

    return k * np.log(n) - 2 * log_likelihood
    
def calculateAicc(inputAic, n , k):
    """Return the small-sample corrected AIC (AICc).

    Args:
        inputAic: An already computed AIC value.
        n: Number of observations.
        k: Number of parameters.

    Returns:
        ``inputAic + 2*k*(k+1) / (n - k - 1)``. No guard is applied for
        ``n - k - 1 == 0``; the division is performed as-is.
    """
    correction = 2 * k * (k + 1) / (n - k - 1)
    return inputAic + correction

def createTrainTestData(dframe, train_size, seed, Y_col_name):
    """Split a DataFrame into scaled train/test feature matrices and targets.

    Args:
        dframe: DataFrame holding the feature columns and the target column.
        train_size: Forwarded to ``train_test_split`` as ``train_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state`` so the
            split is reproducible for a given seed.
        Y_col_name: Name of the target column; every other column is
            treated as a feature.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)`` of numpy arrays. The
        feature matrices are min-max scaled with a scaler fitted on the
        training features only.
    """
    # Honour the caller-supplied seed. The previous code read the
    # module-level ``random_seed`` global here, silently ignoring ``seed``.
    training_data, testing_data = train_test_split(dframe,
                                                   train_size=train_size,
                                                   random_state=seed)

    X_train = training_data.drop(Y_col_name, axis=1).to_numpy()
    Y_train = training_data[Y_col_name].to_numpy()
    X_test = testing_data.drop(Y_col_name, axis=1).to_numpy()
    Y_test = testing_data[Y_col_name].to_numpy()

    # Fit the scaler on the training features only, then apply the same
    # transformation to both splits.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, Y_train, X_test, Y_test

def runCriterionModel(criteria, seed, train_size, data, Y_col_name):
    """Fit a LassoLarsIC model, report its test metrics and return its alpha.

    Args:
        criteria: Passed to ``LassoLarsIC`` as ``criterion`` (the script
            calls this with ``'aic'`` and ``'bic'``).
        seed: Seed forwarded to ``createTrainTestData``.
        train_size: Training fraction forwarded to ``createTrainTestData``.
        data: DataFrame containing the features and the target column.
        Y_col_name: Name of the target column.

    Returns:
        The fitted model's ``alpha_`` attribute.
    """
    X_train, Y_train, X_test, Y_test = createTrainTestData(
        data, train_size, seed, Y_col_name)

    ic_model = LassoLarsIC(criterion=criteria, normalize=False)
    ic_model.fit(X_train, Y_train)

    predictions = ic_model.predict(X_test)
    r2 = ic_model.score(X_test, Y_test)

    # NOTE: the criteria below are evaluated on the held-out test split, with
    # n = number of test rows and k = number of feature columns (not the
    # number of non-zero coefficients).
    n_rows, n_cols = np.shape(X_test)[:2]
    aic = calculateAic(Y_test, predictions, n_rows, n_cols)
    bic = calculateBic(Y_test, predictions, n_rows, n_cols)
    aicc = calculateAicc(aic, n_rows, n_cols)

    printData(train_size, r2, ic_model.alpha_, aic, bic, aicc)

    return ic_model.alpha_
    
def runLassoModel(seed, train_size, data, Y_col_name, inputAlpha):
    """Fit a Lasso model with a fixed alpha and print its test metrics.

    Args:
        seed: Seed forwarded to ``createTrainTestData``.
        train_size: Training fraction forwarded to ``createTrainTestData``.
        data: DataFrame containing the features and the target column.
        Y_col_name: Name of the target column.
        inputAlpha: Regularisation strength passed to ``Lasso(alpha=...)``.

    Returns:
        None. Results are printed through ``printData``.
    """
    X_train, Y_train, X_test, Y_test = createTrainTestData(
        data, train_size, seed, Y_col_name)

    lasso = Lasso(alpha=inputAlpha)
    lasso.fit(X_train, Y_train)

    predictions = lasso.predict(X_test)
    r2 = lasso.score(X_test, Y_test)

    # NOTE: the criteria below are evaluated on the held-out test split, with
    # n = number of test rows and k = number of feature columns.
    n_rows, n_cols = np.shape(X_test)[:2]
    aic = calculateAic(Y_test, predictions, n_rows, n_cols)
    bic = calculateBic(Y_test, predictions, n_rows, n_cols)
    aicc = calculateAicc(aic, n_rows, n_cols)

    printData(train_size, r2, inputAlpha, aic, bic, aicc)
    
#4 Calculate AIC, AICc (i.e. Corrected AIC) and BIC for the models you built in question 1 and question 2.
def printData(train_size, Rscore, alpha, aic, bic, aicc):
    """Print one labelled line per metric for a fitted model.

    Args:
        train_size: Training fraction used for the split.
        Rscore: R-squared score of the model.
        alpha: Regularisation strength of the model.
        aic: Akaike Information Criterion value.
        bic: Bayesian Information Criterion value.
        aicc: Corrected AIC value.
    """
    report = (
        ("Train_size: ", train_size),
        ("R2: ", Rscore),
        ("alpha: ", alpha),
        ("AIC: ", aic),
        ("BIC: ", bic),
        ("AICC", aicc),
    )
    for label, value in report:
        print(label, value)
    
# Written interpretation of AIC versus BIC, one output line per string.
print(
    "AIC values are better for the AIC model with the value of alpha",
    "AIC and BIC hold the same interpretation in terms of model comparison.",
    "That is, the larger difference in either AIC or BIC indicates stronger evidence for one model over the other ",
    "(the lower the better). It's just the the AIC doesn't penalize the number of parameters as strongly as BIC.",
    " There is also a correction to the AIC (the AICc) that is used for smaller sample sizes. ",
    "BIC is greater than AIC in these cases.However that need not necessarily be the case.",
    sep="\n",
)

# Written comparison of AIC and AICc.
print(
    "When the number of observations is large the Akaike Information Criterion (AIC) and the small-sample corrected Akaike Information Criterion (AICc)\nbecome extremely similar because AICc converges to AIC\nTherefore we gain (or lose) almost nothing by switching between the two criteria.",
    "AIC is lower than AICc in all of the models",
    sep="\n",
)

# Seed reused for every train/test split and for the K-fold shuffle.
random_seed = 144

# Training fractions evaluated by each experiment (80% and 20%).
input_train_size = [0.8, 0.2]

# Load the dataset and discard the 'Unnamed: 0' column
# (presumably a saved index column; verify against the CSV).
df = pd.read_csv('assignment4.csv').drop(columns=['Unnamed: 0'])

# Question 1: build Lasso regression models that predict 'mpg', one per
# training fraction in input_train_size (80% and 20%), with alpha fixed at 1.
print("\n######LASSO MODEL######")

for t_size in input_train_size:
    runLassoModel(seed=random_seed, train_size=t_size, data=df,
                  Y_col_name='mpg', inputAlpha=1)
    print("\n")
print("R2 value is 57% that is 57 %of the variation can be explained using this model")

# Questions 2 and 3: tune alpha with the Akaike (AIC) and Bayesian (BIC)
# information criteria, then build a Lasso model with the selected alpha.
# Each row is (criterion, section banner, banner printed before the Lasso refit).
for criterion, banner, lasso_banner in (
    ('aic', "\n######AIC MODEL######", "\nLasso with AIC alpha"),
    ('bic', "\n######BIC MODEL######", "\nLasso with BIC alpha"),
):
    print(banner)

    for t_size in input_train_size:
        # runCriterionModel returns the alpha_ chosen by LassoLarsIC.
        alpha = runCriterionModel(criterion, random_seed, t_size, df, 'mpg')

        # Build a model using that value of alpha.
        print(lasso_banner)
        runLassoModel(random_seed, t_size, df, 'mpg', alpha)
        print("\n")
    
# Written explanation of how alpha is chosen by the information criteria.
print(
    "AIC values indicate a better-fit model, and a model with a delta-AIC (the difference between the two AIC",
    "values being compared) of more than -2 is considered significantly better than the model in comparison",
    "The selected alpha corresponds to the minimum of the AIC or BIC criterion",
    "It fits by computing the criterion on the specified in-sample collection. Both criteria use the training set error to",
    "predict the model generalization error and penalize this too optimistic error.",
    "This cost, however, is contingent on accurate measurement of the degrees of freedom and noise variance.",
    "Both are calculated for large samples (asymptotic results) and assume that the model is true, that is, that the data are",
    "created by the model.",
    sep="\n",
)

# Note: information-criterion based selection also tends to break down when
# the problem is badly conditioned (more features than samples); in that case
# an estimate of the noise variance has to be provided.

############################## 10 fold cv  ####################################

# Question 3: build a simple regression model using 10-fold cross validation
# on the same data and comment on the R-squared values and the predictions.

print("\n######10 fold CV######")

# Unscaled feature matrix (every column except 'mpg') and target vector.
X = df.drop(columns=['mpg']).to_numpy()
Y = df['mpg'].to_numpy()

lm = LinearRegression()
# shuffle=True makes KFold shuffle the rows before forming the 10 folds;
# random_state keeps that shuffle reproducible.
folds = KFold(n_splits=10, shuffle=True, random_state=random_seed)

# One R2 score per fold.
scores = cross_val_score(lm, X, Y, scoring='r2', cv=folds)
print("The R2 scores are \n", scores)

# Cross-validated predictions from the same estimator and splitter.
y_pred = cross_val_predict(lm, X, Y, cv=folds)
print("Predictions: \n", y_pred)
# Author's observation: shuffling guards against ordering bias in the inputs,
# and cross-validation helps avoid the overfitting that can occur without it,
# especially on small datasets.

print("The R2 values are better than in the rest of the models.Each of it explains the variation in the model")

print(
    "alpha: one technique uses simply the training set and some information criterion to identify the optimal value of alpha, ",
    "while another uses cross-validation.",
    "In this case, both ways are equally effective. Even in terms of computational performance, the in-sample hyperparameter",
    "selection seems to be effective. It can only be employed when the number of samples compared to the number of ",
    "characteristics is large enough.",
    "That's why cross-validation hyperparameter optimization is a safe strategy: it works in a variety of situations.",
    sep="\n",
)






AIC values are better for the AIC model with the value of alpha
AIC and BIC hold the same interpretation in terms of model comparison.
That is, the larger difference in either AIC or BIC indicates stronger evidence for one model over the other 
(the lower the better). It's just the the AIC doesn't penalize the number of parameters as strongly as BIC.
 There is also a correction to the AIC (the AICc) that is used for smaller sample sizes. 
BIC is greater than AIC in these cases.However that need not necessarily be the case.
When the number of observations is large the Akaike Information Criterion (AIC) and the small-sample corrected Akaike Information Criterion (AICc)
become extremely similar because AICc converges to AIC
Therefore we gain (or lose) almost nothing by switching between the two criteria.
AIC is lower than AICc in all of the models

######LASSO MODEL######
Train_size:  0.8
R2:  0.5742378400198931
alpha:  1
AIC:  481.5073586106532
BIC:  498.18154505337037
AICC 483.062914166

In [8]:
# #5 What are Randomized Control Trials: A/B Tests? 
# #What is its significance? Elaborate. Explain methods used in analyzing the RCT data?

# A randomized controlled trial (RCT) is a type of experiment designed to control for factors that are not under the experimenter's direct control.
# RCT randomly assigns subjects or volunteers to one of two groups: experimental or control, with the difference being the variable being examined.
# Randomness in the assignment of individuals to treatments decreases biases such as selection and allocation bias, while also balancing both known and unknown prognostic factors.
# A/B testing is a straightforward randomized controlled trial. For a single vector variable, two samples (A and B) are compared. A/B testing compares two variations of a single variable,
# usually by comparing a subject's response to variant A to variant B and determining which form is better.
# To make educated decisions, A/B testing and randomized controlled trials are performed.
# They all have the same goal: to see if your hypothesis is accurate,
# to test the effectiveness of interventions using data, and to try to identify causal links. Despite the fact that you can get answers to these concerns, the strategy you use will ultimately be determined by your primary concern and context.
# RCTs, as opposed to simple correlations, are regarded as the gold standard for learning about causal links. This is primarily due to the design's randomization, which removes (or at least reduces) bias. This type of experiment allows you to see how a
# certain variable affects an outcome by keeping everything else constant.
# For marketing departments and enterprises, A/B testing has become a must. A/B testing has become a go-to method
# for marketing teams and businesses trying to make quick decisions that result in better outcomes.
# This is due to the fact that versions can be tried simultaneously across an audience, and the data will show
# (sometimes in real time) which performs best.
# In RCTs, the three statistical approaches of longitudinal analysis of covariance, repeated measures analysis,
# and analysis of changes are most commonly employed to evaluate treatment effects.
# Analysis of RCT methods:
# The main idea behind a summary statistic is to condense the longitudinal evolution of an outcome variable
# across time into a single value. These summary statistics can be compared between the intervention and control
# groups using a relatively basic cross-sectional analysis to assess the intervention's effectiveness.
# The area under the curve (AUC) is one of the most commonly used summary statistics. However, more advanced statistical
# methods such as mixed model analysis and generalised estimating equations (GEE analysis) are commonly used nowadays.
# Though GLM for repeated measurements is not a novel (more sophisticated) statistical technique for longitudinal data
# analysis, it can be used to analyze a continuous outcome variable measured in an RCT with several follow-up measurements.
# GLM for repeated measurements (also known as (multivariate) analysis of variance ((M)ANOVA) for repeated measurements) is
# based on the same principle as the well-known paired t-test: the T - 1 absolute differences between consecutive measurements
# are subjected to a statistical test.
# Multilevel analysis, hierarchical linear modeling, and random effects modeling are all terms used to describe mixed model
# analysis. As previously stated, the main goal behind all longitudinal statistical approaches is to efficiently account
# for 'subject'. Adjusting for 'subject' actually involves estimating different intercepts for all subjects in the longitudinal
# research. The core premise of mixed model analysis in longitudinal research is that (just one) variance of those intercepts,
# i.e. a random intercept, is estimated rather than all independent intercepts.
# Is the sample representative of the entire population?
# RCTs are typically conducted on a sample of people rather than the entire population.
# It's critical for the experiment that the chosen sample accurately reflects the population's baseline characteristics.
# Inferential leaps or generalizations from samples to populations aren't always easy, and they're rarely foolproof.
# Is the sample size adequate for the target population?
# Another crucial step is to select an appropriate sample size that yields statistically meaningful clinical differences.
# To avoid statistical error, sample size estimation should be done just prior to the trial and should not be altered while
# the study is running. Multiple factors influence the size of a study, including the accepted threshold of significance
# (alpha error), study power, predicted effect size, occurrence rate in the population (prevalence rate),
# alternative hypothesis, and population standard deviation. There are methods for calculating sample size,
# but understanding the relationship between each element and sample size is more important.
# Designing  Effectively
# Experimental design is preferable to observational design because it allows for a better grasp of variables
# and the establishment of a cause–effect hypothesis. Preexperimental, quasi-experimental, and true experimental
# study designs are all used in experiments. The absence or presence of group randomization distinguishes quasi-experimental
# and true experimental designs.
# Are Bias-Reducing Measures (Selection or Confounding Bias) Being Taken?
# Interventional studies/RCTs are intended to assess the efficacy and safety of a new treatment for a
# clinical illness. It is critical that the observed outcome is not due to chance.
# A range of methods, such as control selection, randomization, blinding, and allocation concealment, can help
# eliminate confounding factors and bias.