# Exploratory Data Analysis for Concrete Strength Prediction

### Importing Necessary Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport

### Loading Data

In [None]:
# Show every column when a frame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Location of the raw CSV. The default is the original author's local path;
# set the CONCRETE_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
data_path = os.environ.get(
    "CONCRETE_DATA_PATH",
    "C:/Users/Bala/Projects/Internship/ConcreteStrength/raw_data/concrete_data.csv",
)
df = pd.read_csv(data_path)
df.head()

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

### Observations

1. This dataset consists of 9 columns (8 input features and the target), listed below:
     * cement - kg in a m3 mixture
     * blast_furnace_slag - kg in a m3 mixture
     * fly_ash - kg in a m3 mixture
     * water - kg in a m3 mixture
     * superplasticizer - kg in a m3 mixture
     * coarse_aggregate - kg in a m3 mixture
     * fine_aggregate - kg in a m3 mixture
     * age - days (1~365)
     * concrete_compressive_strength - MPa (target)
2. The data type of each feature is shown above
3. 1030 records are present in this dataset

In [None]:
# Build a profiling report for the loaded frame and render it as notebook widgets.
# NOTE(review): the pandas_profiling package has been renamed to ydata-profiling
# upstream — confirm which one the target environment provides.
report=ProfileReport(df)
report.to_widgets()

In [None]:
# Persist the same report as a standalone HTML file in the working directory.
report.to_file("ProfileReport.html")

### Observations from Profile Report

1. There are no missing values present in the dataset
2. Zeros are present in blast_furnace_slag, fly_ash and superplasticizer
3. No categorical values are present in the dataset
4. Duplicates are present in the dataset
5. No columns with zero standard deviation 

### Handling duplicates

In [None]:
# Remove exact duplicate rows and rebuild a contiguous 0..n-1 index.
# Without the reset the surviving rows keep their original labels, so labels
# and positions diverge and any later positional lookup built from labels
# (e.g. df.index[list_of_labels]) would address the wrong rows.
df = df.drop_duplicates().reset_index(drop=True)
print(df.shape[0])

### Analysis and Visualization

Distribution of Each Feature

In [None]:
# Draw one distribution plot per column; each figure is labelled with the
# column name on the x-axis and a title-cased version of it as the title.
for column_name in df.columns:
    sns.displot(df[column_name])
    plt.xlabel(column_name)
    plt.title(column_name.title())
    plt.show()

In [None]:
# Pairwise scatter plots for every pair of columns in the frame.
sns.pairplot(df)

Correlation between each pair of features

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
ax = sns.heatmap(df.corr(), annot=True,cmap='YlGnBu')
# Widen the y-axis limits by 0.5 at each end.
# NOTE(review): this looks like the usual workaround for matplotlib 3.1.1 cropping
# the first/last heatmap rows — confirm it is still needed with the installed version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

### Observations
1. Cement is more correlated with the target variable than other features

### Outliers

In [None]:
# One box plot per column to eyeball outliers.
# DataFrame.boxplot only reads the frame, so it is plotted directly; the
# previous per-iteration df.copy() and the self-assignment of the column
# were no-ops that only cost time and memory.
for feature in df.columns:
    df.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

### Observations

1. The following features have outliers:
    * Age
    * Fine Aggregate
    * Water
    * Superplasticizer
    * Blast Furnace Slag

In [None]:
def detect_outliers_iqr(data, factor=1.5):
    """Return the values of ``data`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : iterable of numbers
        Sample to inspect (e.g. a list or a pandas Series).
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    list
        Outlying values in ascending order; an empty list when ``data`` is
        empty or contains no outliers.
    """
    values = np.asarray(sorted(data))
    # Percentiles are undefined for an empty sample; report "no outliers".
    if values.size == 0:
        return []
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lwr_bound = q1 - factor * iqr
    upr_bound = q3 + factor * iqr
    # Strict comparisons: values exactly on a fence are not outliers.
    is_outlier = (values < lwr_bound) | (values > upr_bound)
    return values[is_outlier].tolist()

In [None]:
# Record, in column order, every column that has at least one IQR outlier,
# printing the column name alongside its outlier count.
outliers_column = []
for feature in df.columns:
    outlier_count = len(detect_outliers_iqr(df[feature]))
    if outlier_count == 0:
        continue
    print(f"{feature}", outlier_count)
    outliers_column.append(feature)

In [None]:
# Show the last column that reported outliers; the cells below treat this
# entry as the target column and every earlier entry as a feature.
print(outliers_column[-1])

Outlier handling in the feature columns: values are capped at the 10th and 90th percentiles

In [None]:
# Cap each outlier-bearing feature at its own 10th and 90th percentiles.
# NOTE(review): the [:-1] slice assumes the last entry of outliers_column is the
# target column — confirm against the printed name before relying on it.
for column_name in outliers_column[:-1]:
    lower_cap = np.percentile(df[column_name], 10)
    upper_cap = np.percentile(df[column_name], 90)
    below_lower = df[column_name] < lower_cap
    df.loc[below_lower, column_name] = lower_cap
    above_upper = df[column_name] > upper_cap
    df.loc[above_upper, column_name] = upper_cap

Outlier handling in the target column: rows whose target value is an IQR outlier are dropped

In [None]:
# Drop every row whose target value is an IQR outlier.
# The last entry of outliers_column is treated as the target column.
target = outliers_column[-1]
sample_outliers = detect_outliers_iqr(df[target])
# Select the rows with a boolean mask and drop them by index *label*.
# Looking labels up as positions (df.index[labels]) removes the wrong rows
# once the index has gaps, and matching only the first row per value misses
# repeated outlier values.
outliers_index = df.index[df[target].isin(sample_outliers)]
df.drop(index=outliers_index, inplace=True)
print(df.shape[0])