In [1]:
import polars as pl
import numpy as np
import pandas as pd
import statsmodels as sm
import scipy
import plotly.express as px
import sklearn as sk

In [2]:
df = pl.from_pandas(pd.read_excel("data/Concrete_Data.xls"))
df.columns = ["Cement", "Blast Furnace Slag", "Fly Ash", "Water", "Superplasticizer", "Coarse Aggregate", "Fine Aggregate", "Age", "Concrete compressive strength"]
# pl.read_excel("Concrete_data.xls")
df = df.to_pandas()
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [3]:
## plot all columns of df as line graphs into one figure with plotly
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
fig = px.box(df_train)
fig.show()

In [4]:
# make a scatter plot matrix of all columns of df_train
fig = px.scatter_matrix(df_train)
fig.show()

In [5]:
# make a correlation matrix of all columns of df_train
corr_matrix = df_train.corr()
corr_matrix

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
Cement,1.0,-0.311617,-0.395015,-0.107325,0.090642,-0.066147,-0.225082,0.083939,0.496769
Blast Furnace Slag,-0.311617,1.0,-0.309704,0.106491,0.052912,-0.300054,-0.251836,-0.047295,0.144354
Fly Ash,-0.395015,-0.309704,1.0,-0.254854,0.369905,-0.009785,0.062927,-0.147909,-0.137479
Water,-0.107325,0.106491,-0.254854,1.0,-0.655838,-0.156552,-0.416853,0.258548,-0.301306
Superplasticizer,0.090642,0.052912,0.369905,-0.655838,1.0,-0.275609,0.207509,-0.172062,0.376956
Coarse Aggregate,-0.066147,-0.300054,-0.009785,-0.156552,-0.275609,1.0,-0.221254,-0.010228,-0.172517
Fine Aggregate,-0.225082,-0.251836,0.062927,-0.416853,0.207509,-0.221254,1.0,-0.135286,-0.149146
Age,0.083939,-0.047295,-0.147909,0.258548,-0.172062,-0.010228,-0.135286,1.0,0.327808
Concrete compressive strength,0.496769,0.144354,-0.137479,-0.301306,0.376956,-0.172517,-0.149146,0.327808,1.0


In [6]:
# normalize data in df_train
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_train)
df_scaled = pd.DataFrame(df_scaled, columns=df_train.columns)
df_scaled.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,-0.828635,-0.855292,0.762033,-0.765446,0.224201,0.41555,1.676831,-0.29298,-0.621033
1,0.374843,-0.855292,-0.816902,0.103778,-1.013764,1.136985,0.141922,-0.633845,-1.10842
2,0.317587,1.568944,-0.816902,-1.234582,1.352934,-1.551189,1.358359,-0.698772,-0.652239
3,0.688826,-0.638537,1.397879,-1.314978,0.791878,-0.405305,0.366726,-0.698772,-0.745211
4,-1.130428,1.31226,1.507833,-0.132681,2.1308,-1.73091,-0.382619,-0.29298,-0.142679


In [7]:
fig = px.box(df_scaled)
fig.show()

In [8]:
# do PCA on df_train_scaled
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(df_scaled)
X_pca = pca.transform(df_scaled)
print(X_pca.shape)
X_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
X_pca.head()

fig = px.scatter(X_pca, x="PC1", y="PC2")
fig.show()

(721, 2)


In [9]:
## UNTESTED!!!
# scale the test data
scaler = StandardScaler()
df_test_scaled = scaler.fit_transform(df_test)
df_test_scaled = pd.DataFrame(df_scaled, columns=df_test.columns)
# df_test_scaled.head()
# do pca on df_train
pca_test = PCA(n_components=2)
pca_test.fit(df_test_scaled)
X_pca = pca_test.transform(df_test_scaled)
print(X_pca.shape)
X_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
X_pca.head()

fig = px.scatter(X_pca, x="PC1", y="PC2")
fig.show()

(721, 2)
