In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Import csv

In [2]:
df = pd.read_csv("data/consolidated_2022-10-13_v1", index_col = None)
df = df.drop(columns = 'Unnamed: 0')
df.columns = ["time", "temp", "wt", "dta", "wt_pc", "compound", "condition"]

## Test Train Split

In [3]:
def test_train_compound(df, product):
    """
    Function to split test/train data for each compound
    
    Argument:
        df      : pandas dataframe
        product : tbbpa, caoh2, both
        
    Output:
        X_train, X_test, y_train, y_test : 4 pandas dataframe which have the split data 
    """
    df = df.query('compound == @product')
    X_train, X_test, y_train, y_test = train_test_split(df[["time", "temp", "condition" ]], df["wt"], test_size=0.2, random_state=7)
    return X_train, X_test, y_train, y_test

## Modelling

### Linear Modelling

In [17]:
def linear_model(df, product = 'caoh2'):
    X_train, X_test, y_train, y_test = test_train_compound(df, product)
    print("The 4 matrices for ", product, " are of sizes: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    reg = LinearRegression().fit(X_train, y_train)
    print("The accuracy for ", product, " model is ", reg.score(X_train, y_train)) 

In [38]:
for i in df.compound.value_counts().index:
    linear_model(df, i)

The 4 matrices for  tbbpa  are of sizes:  (31132, 3) (7783, 3) (31132,) (7783,)
The accuracy for  tbbpa  model is  0.8241165411318243
The 4 matrices for  caoh2  are of sizes:  (31108, 3) (7778, 3) (31108,) (7778,)
The accuracy for  caoh2  model is  0.8452718463394618
The 4 matrices for  both  are of sizes:  (31068, 3) (7767, 3) (31068,) (7767,)
The accuracy for  both  model is  0.9282599428748095
