# Case 3: User KGI 04

## Import Dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Data Loading

In [None]:
# Location of the flotation-plant CSV, relative to the notebook's working directory.
PATH = './Dataset/MiningProcess_Flotation_Plant_Database.csv'

# The measurement columns in this file use a comma as the decimal mark
# (e.g. "55,2", "3019,53"). Without decimal=',' pandas keeps them as strings,
# which leaves describe(), select_dtypes(include=[np.number]) and every
# numeric plot with nothing to work on.
df = pd.read_csv(PATH, parse_dates=['date'], decimal=',')
print("Dataset loaded successfully!")

Dataset loaded successfully!


## Basic Data Exploration

In [None]:
# Quick look at the loaded frame: dimensions, a preview and summary statistics.
print("\n--- Data Exploration ---")
print(f"Dataset Shape: {df.shape}")
print("First 5 rows:", df.head(), sep="\n")

print("\nBasic Statistics:", df.describe(), sep="\n")

# Null count per column, then the same figure as a percentage of all rows.
missing_values = df.isna().sum()
missing_share = missing_values.div(len(df)).mul(100)
print("\nMissing Values:", missing_values, sep="\n")
print("Missing Values Percentage:", missing_share, sep="\n")


--- Data Exploration ---
Dataset Shape: (737453, 24)
First 5 rows:
                 date % Iron Feed % Silica Feed Starch Flow Amina Flow  \
0 2017-03-10 01:00:00        55,2         16,98     3019,53    557,434   
1 2017-03-10 01:00:00        55,2         16,98     3024,41    563,965   
2 2017-03-10 01:00:00        55,2         16,98     3043,46    568,054   
3 2017-03-10 01:00:00        55,2         16,98     3047,36    568,665   
4 2017-03-10 01:00:00        55,2         16,98     3033,69    558,167   

  Ore Pulp Flow Ore Pulp pH Ore Pulp Density Flotation Column 01 Air Flow  \
0       395,713     10,0664             1,74                      249,214   
1       397,383     10,0672             1,74                      249,719   
2       399,668      10,068             1,74                      249,741   
3       397,939     10,0689             1,74                      249,917   
4       400,254     10,0697             1,74                      250,203   

  Flotation Column 02 Ai

## Data Visualization

In [None]:
print("\n--- Data Visualization ---")

# Histogram with a KDE overlay of the non-null target values.
fig_target, ax_target = plt.subplots(figsize=(10, 6))
sns.histplot(df['% Silica'].dropna(), kde=True, ax=ax_target)
ax_target.set_title('Distribution of % Silica (Target)')
fig_target.savefig('silica_distribution.png')
print("Target distribution plotted and saved as 'silica_distribution.png'")

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
# Restrict to numeric dtypes before computing correlations.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax_corr,
)
ax_corr.set_title('Correlation Matrix')
fig_corr.tight_layout()
fig_corr.savefig('correlation_matrix.png')
print("Correlation matrix plotted and saved as 'correlation_matrix.png'")

# Target value plotted against the timestamp column.
fig_ts, ax_ts = plt.subplots(figsize=(15, 6))
ax_ts.plot(df['date'], df['% Silica'])
ax_ts.set_title('Silica Percentage Over Time')
ax_ts.set_xlabel('Date')
ax_ts.set_ylabel('% Silica')
plt.xticks(rotation=45)  # ax_ts is the current axes, so this rotates its x tick labels
fig_ts.tight_layout()
fig_ts.savefig('silica_time_series.png')
print("Time series plotted and saved as 'silica_time_series.png'")

# One panel per selected feature, each plotted against the target on a 3x2 grid.
features_to_plot = [
    '% Iron Feed',
    '% Silica Feed',
    'Starch Flow',
    'Amina Flow',
    'Ore Pulp pH',
    'Ore Pulp Density',
]
fig_scatter, axes_scatter = plt.subplots(3, 2, figsize=(20, 15))

# axes_scatter.flat walks the grid row by row, matching subplot positions 1..6.
for ax_panel, feature in zip(axes_scatter.flat, features_to_plot):
    ax_panel.scatter(df[feature], df['% Silica'], alpha=0.5)
    ax_panel.set_title(f'{feature} vs % Silica')
    ax_panel.set_xlabel(feature)
    ax_panel.set_ylabel('% Silica')

fig_scatter.tight_layout()
fig_scatter.savefig('feature_scatter_plots.png')