# Statistical modeling for TRIBE2 PanCan 

In [6]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff

In [2]:
# Load the per-arm mutation tables as float arrays.
# Each file is comma-separated and its first row is skipped (skiprows=1).
# The files are opened in context managers so the handles are closed as soon
# as parsing finishes, instead of being left open for the kernel's lifetime.
with open("arm0_best_mutations.csv", "rb") as arm0_file:
    arm0_best_dataset = np.loadtxt(arm0_file, delimiter=",", skiprows=1)

with open("arm1_best_mutations.csv", "rb") as arm1_file:
    arm1_best_dataset = np.loadtxt(arm1_file, delimiter=",", skiprows=1)

In [3]:
from sklearn.model_selection import train_test_split

# For each arm: every column except the last is a feature, the last column is
# the target. 30% of the rows are held out for testing, with a fixed
# random_state so the split is reproducible.
arm0_x = arm0_best_dataset[:, :-1]
arm0_y = arm0_best_dataset[:, -1]
arm0_x_train, arm0_x_test, arm0_y_train, arm0_y_test = train_test_split(
    arm0_x,
    arm0_y,
    test_size=0.30,
    random_state=42,
)

arm1_x = arm1_best_dataset[:, :-1]
arm1_y = arm1_best_dataset[:, -1]
arm1_x_train, arm1_x_test, arm1_y_train, arm1_y_test = train_test_split(
    arm1_x,
    arm1_y,
    test_size=0.30,
    random_state=42,
)

## Regression

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression fitted independently per arm, then scored on the
# held-out test rows. `fit` returns the estimator itself, so construction and
# fitting are chained into one statement.

# --- ARM0 ---
arm0_best_model = LinearRegression().fit(arm0_x_train, arm0_y_train)
arm0_best_pred = arm0_best_model.predict(arm0_x_test)
print(f"""
ARM0:
    Coeff: {arm0_best_model.coef_}
    MSE: {mean_squared_error(arm0_y_test, arm0_best_pred)}
    R2: {r2_score(arm0_y_test, arm0_best_pred)}""")

# --- ARM1 ---
arm1_best_model = LinearRegression().fit(arm1_x_train, arm1_y_train)
arm1_best_pred = arm1_best_model.predict(arm1_x_test)
print(f"""
ARM1:
    Coeff: {arm1_best_model.coef_}
    MSE: {mean_squared_error(arm1_y_test, arm1_best_pred)}
    R2: {r2_score(arm1_y_test, arm1_best_pred)}""")



ARM0:
    Coeff: [-1.04897447e+00 -1.80897720e+02  1.37267681e+02 -6.83245508e+00
  1.02535206e+01  1.91952641e+01 -1.13641313e+05 -1.73820083e+01
 -2.21039449e+00  4.35342608e+00]
    MSE: 102662.60427968217
    R2: -0.010488950903972283

ARM1:
    Coeff: [-1.67821379e+00  3.48759636e+01 -8.83930041e-14 -1.77135169e+00
  1.30501009e+01  1.23897954e+01  2.25863332e+01 -3.79652835e+00
  1.50493241e+01  9.75596632e+01]
    MSE: 72416.1384060918
    R2: -0.1727218351901949


In [5]:
from sklearn.linear_model import Lasso
import seaborn as sns

# Lasso with the estimator's default settings, fitted independently per arm
# on the same train/test split as the linear-regression cell.
# NOTE: this rebinds arm0_best_model / arm1_best_model (and the *_pred arrays),
# replacing the LinearRegression objects created earlier.

# --- ARM0 ---
arm0_best_model = Lasso().fit(arm0_x_train, arm0_y_train)
arm0_best_pred = arm0_best_model.predict(arm0_x_test)
print(f"""
ARM0:
    Coeff: {arm0_best_model.coef_}
    MSE: {mean_squared_error(arm0_y_test, arm0_best_pred)}
    R2: {r2_score(arm0_y_test, arm0_best_pred)}""")

# --- ARM1 ---
arm1_best_model = Lasso().fit(arm1_x_train, arm1_y_train)
arm1_best_pred = arm1_best_model.predict(arm1_x_test)
print(f"""
ARM1:
    Coeff: {arm1_best_model.coef_}
    MSE: {mean_squared_error(arm1_y_test, arm1_best_pred)}
    R2: {r2_score(arm1_y_test, arm1_best_pred)}""")



ARM0:
    Coeff: [  -1.33078005 -122.70129897  131.79876447   -7.03115446    8.42325792
   17.10134682   -0.          -19.58145036   -3.27362589    6.84621008]
    MSE: 101948.60859571294
    R2: -0.003461223965753213

ARM1:
    Coeff: [-1.71578345 23.92768148  0.         -0.         12.93747728 12.47394641
 22.19447539 -3.80549265 14.90003091 91.5432479 ]
    MSE: 72421.3744832726
    R2: -0.17280662930067647
