In [29]:
# linear regression
# y = wx+b where w is weight, b is bias
# in ml they are parameters which machine finds to best fit our data

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


NB_DIR = Path.cwd()
PROJECT_ROOT = NB_DIR.parent

RAW_DATA_DIR = PROJECT_ROOT / 'data' / 'raw'
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

xls_path = RAW_DATA_DIR / 'Concrete_Data.xls'
df = pd.read_excel(xls_path)



In [30]:
# prep data

df.rename(columns = {
  'Cement (component 1)(kg in a m^3 mixture)': 'cement',
  'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'blast_furnace_slag',
  'Fly Ash (component 3)(kg in a m^3 mixture)': 'fly_ash',
  'Water  (component 4)(kg in a m^3 mixture)': 'water',
  'Superplasticizer (component 5)(kg in a m^3 mixture)': 'superplasticizer',
  'Coarse Aggregate  (component 6)(kg in a m^3 mixture)': 'coarse_aggregate',
  'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'fine_aggregate',
  'Age (day)': 'age',
  'Concrete compressive strength(MPa, megapascals) ': 'compressive_strength'
}, inplace = True)

df = (df[df['age'] == 28]).copy()
df.drop(columns = ['age'], inplace = True)
df.sample(7)

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,compressive_strength
394,405.0,0.0,0.0,175.0,0.0,1120.0,695.0,52.303649
879,313.0,145.0,0.0,178.0,8.0,867.0,824.0,44.388465
589,339.2,0.0,0.0,185.7,0.0,1069.2,754.3,31.899986
862,140.0,164.0,128.0,237.0,6.0,869.0,656.0,35.225329
919,313.0,0.0,0.0,178.0,8.0,1000.0,822.0,25.096926
390,450.1,50.0,0.0,200.0,3.0,1124.4,613.2,39.375974
884,300.0,0.0,120.0,212.0,10.0,878.0,728.0,23.835185


In [31]:
# define features and target

column_names = df.columns

target = 'compressive_strength'
features = column_names[column_names != target]

# y = wX + b
X = df[features] # feature vectors
y = df[target] # target

# split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42) # to get predictability in all instances of the model

print(f"Training Set: {len(y_train)}/{len(df)}")
print(f"Testing Set: {len(y_test)}/{len(df)}")

# use model = LiR

# Given the values of X and y, solve for w and b
model = LinearRegression().fit(X_train, y_train)

weights = model.coef_
intercept = model.intercept_

print(f'Weights: {weights}')
print(f'Intercept: {intercept}')

score = model.score(X_test, y_test)

print(f'Coefficient of Determination (R Squared) = {score:.3f}')

# R2 value will be if we use X_train and y_train? (train came out 0.782, what it means?)
# train score is mostly higher because The model "sees" the training data during its learning process, so it naturally fits that data better than "unseen" test data

# small gap in this suggests a balanced model, a high test R2 suggest 'overtraining'
# check other metrics like RMSE or k-fold-cross

Training Set: 340/425
Testing Set: 85/425
Weights: [ 0.17303948  0.15083341  0.10976379 -0.04728941  0.13121105  0.04160992
  0.05862292]
Intercept: -105.99063240392441
Coefficient of Determination (R Squared) = 0.736


In [None]:
# use plots/graphs for further analysis