In [2]:
import pandas as pd

# Part 1. Loading the dataset

In [3]:
# Location of the raw CSV for this exercise.
url = 'https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv'

# index_col=0 promotes the first CSV column (Temperature °C) to the row index,
# so it is NOT one of df's data columns; only the remaining columns are.
df = pd.read_csv(filepath_or_buffer=url, index_col=0)

# Preview the first rows as the cell output.
df.head(n=15)

Unnamed: 0_level_0,Mols KCL,Size nm^3
Temperature °C,Unnamed: 1_level_1,Unnamed: 2_level_1
469,647,624474.3
403,694,577961.0
302,975,619684.7
779,916,1460449.0
901,18,43257.26
545,637,712463.4
660,519,700696.0
143,869,271826.0
89,461,89198.03
294,776,477021.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each data column.
df.describe()

Unnamed: 0,Mols KCL,Size nm^3
count,1000.0,1000.0
mean,471.53,508611.1
std,288.482872,447483.8
min,1.0,16.11429
25%,226.75,129826.7
50%,459.5,382718.2
75%,710.25,760321.1
max,1000.0,1972127.0


# Part 2. Splitting Data

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# The target is the 'Size nm^3' column (the last column of df); every other
# data column is a feature. The row index is not a column, so it is not
# part of X.
target_col = 'Size nm^3'
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, test_size=0.1, random_state=42
)

# Show the target series as the cell output.
y

Temperature °C
469    6.244743e+05
403    5.779610e+05
302    6.196847e+05
779    1.460449e+06
901    4.325726e+04
           ...     
894    1.545661e+06
327    6.737041e+05
791    3.477543e+05
769    8.684794e+05
919    8.476413e+05
Name: Size nm^3, Length: 1000, dtype: float64

# Part 3. Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# Fit a linear regression on the training split (fit() returns the estimator,
# so construction and fitting can be chained).
linRegModel = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows, shown as the cell output.
linRegModel.predict(X_test)

array([ 588628.99707011,  735909.22088716,  732430.16048204,
        971325.6416341 ,  651252.0843624 ,  161864.25404118,
        196654.85809245,  537602.77779491,  129393.02359332,
        657050.51837095, 1052503.71775374,  422793.78442572,
       1006116.24568537,  280152.3078155 ,  372927.25195223,
        986401.57005632,  435550.33924451,  553838.39301884,
        101560.5403523 ,  815927.61020509,  172301.43525656,
        -19046.88702544,  827524.47822218,  434390.65244281,
        476139.37730433,   92283.04593863,  922618.79596232,
        492374.99252826,  774178.88534356,  167662.68804972,
        369448.1915471 ,   65610.24949932,  194335.48448903,
        782296.69295553,  307984.79105652,  868113.516282  ,
        905223.49393669,  529484.97018295,  505131.54734706,
        579351.50265644,  750985.14930938,  474979.69050262,
        682563.62800855,   44735.88706856,  200133.91849757,
        563115.88743251,  659369.89197436,  201293.60529928,
        215209.84691979,

In [7]:
# A single hand-picked sample: one row, one feature value.
sampleInput = [[32.123123]]

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names; passing the bare nested list makes newer scikit-learn
# versions warn that X lacks the feature names seen during fit.
sampleFrame = pd.DataFrame(sampleInput, columns=X_train.columns)
samplePredict = linRegModel.predict(sampleFrame)
print(samplePredict)

[-1508.80088172]


In [8]:
# Evaluate the fitted model on the held-out test split. For scikit-learn
# regressors, score() reports the coefficient of determination (R^2),
# not a classification accuracy.
linRegModel.score(X_test,y_test)

0.43482881518742433

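This score (an $R^2$ of about 0.43 on the test set) tells me that this linear regression model is not a very good way to predict the size of the slime. Note that because temperature was loaded as the DataFrame index, the model is only using the amount of KCl as a feature, not the combination of temperature and the amount of KCl added to the slime. The score is not the chance that the model will predict the size of the slime correctly; it means the model explains less than 50% (about 43%) of the variation in slime size on the test data, which I think is bad. For this model to work, it will need more informative features (such as temperature), a lot more data, or a different kind of model to help predict the size of the slime.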

In [9]:
# Slope(s) on the first line, intercept on the second, for the fitted line.
print(linRegModel.coef_, linRegModel.intercept_, sep='\n')

[1159.68680171]
-38761.56265449233


$$ h(x) = 1159.68680x - 38761.56265 $$

# Part 4. Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

In [13]:
# Score a fresh (unfitted) linear model with 5-fold cross-validation over the
# whole dataset, using R^2 as the metric; one score is returned per fold.
lin_reg = LinearRegression()
cvs = cross_val_score(estimator=lin_reg, X=X, y=y, cv=5, scoring='r2')

# Show the per-fold scores as the cell output.
cvs

array([0.51936072, 0.60582887, 0.49438144, 0.54422642, 0.52886197])

When using `cross_val_score` with linear regression and 5 folds, the second fold of the data gives the best score back, with an $R^2$ of around 0.61, which is a lot larger than the 0.43 from just cutting the data 90:10 once to train and test. The worst fold is the third one, with an $R^2$ of around 0.49. Because the score changes depending on which rows end up in the test fold, a single split can be misleading, and having more data to test on may help give higher and more stable scores.

# Part 5. Polynomial Regression

In [79]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [130]:
# Plain linear fit on the same training split (not used further in this cell).
linModel = LinearRegression()
linModel.fit(X_train,y_train)

# Expand the features into degree-2 polynomial terms. The transformer is
# fitted on the training split only; the test split is then passed through
# transform() so nothing is ever learned from held-out data.
polyModel = PolynomialFeatures(degree=2)
xPolyTrain = polyModel.fit_transform(X_train)
xPolyTest = polyModel.transform(X_test)

# Linear regression on the expanded features is a polynomial regression.
linModel2 = LinearRegression()
linModel2.fit(xPolyTrain,y_train)

# R^2 of the polynomial model on the held-out test split.
print(linModel2.score(xPolyTest,y_test))


0.4253405141730977


In [131]:
# Fitted weights of the polynomial model, one per column generated by
# PolynomialFeatures(degree=2).
print(linModel2.coef_)

[0.00000000e+00 9.69719945e+02 1.96712730e-01]


In [132]:
# Constant (intercept) term of the polynomial model.
print(linModel2.intercept_)

-9451.934865508694


$$ h(x) = 0.19671x^2 + 969.71994x - 9451.93486 $$

The polynomial regression model did slightly worse than the linear regression model on the test set: its $R^2$ score was around 0.43 (0.425, compared with 0.435 for the linear model). Linear regression models do not seem to perform too well for this dataset. Maybe the data in this dataset is not very good, or the sizes of the slimes are just too hard to predict from this input alone.