# Melting Point Prediction - Exploratory Data Analysis

This notebook demonstrates the exploratory data analysis and model training pipeline for melting point prediction.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Import project modules
import sys
sys.path.append('..')
from src.data_loader import MeltingPointDataLoader
from src.models import BaseModel, get_default_model_configs
from src.train import TrainingPipeline

# Global plot styling: seaborn's 'whitegrid' style, plus a default figure size
# of (12, 6) that applies to any figure created without its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load and Explore Data

In [None]:
# Read the raw training table; later cells refer to it as `train_df`.
train_df = pd.read_csv('../data/train.csv')

# Quick orientation: dimensions and column names, then a preview of the rows.
# A single print with a newline separator emits the same text as three prints.
print(
    f"Training data shape: {train_df.shape}",
    f"\nColumns: {list(train_df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
train_df.head()

In [None]:
# Summary statistics for the training table. By default pandas restricts
# describe() to the numeric columns when any are present. The call is left as
# the cell's last expression so the notebook renders the resulting table.
train_df.describe()

## 2. Analyze Target Variable

In [None]:
# Target distribution: histogram (left) and box plot (right), followed by a
# short printed summary of the same column.
target = train_df['melting_point']

fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

hist_ax.hist(target, bins=30, edgecolor='black')
hist_ax.set_xlabel('Melting Point (°C)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Melting Points')

box_ax.boxplot(target)
box_ax.set_ylabel('Melting Point (°C)')
box_ax.set_title('Box Plot of Melting Points')

fig.tight_layout()
plt.show()

# Each statistic is printed on its own line with two decimals.
print("Melting Point Statistics:")
target_stats = (
    ("Mean", target.mean()),
    ("Median", target.median()),
    ("Std Dev", target.std()),
    ("Min", target.min()),
    ("Max", target.max()),
)
for stat_label, stat_value in target_stats:
    print(f"{stat_label}: {stat_value:.2f}°C")

## 3. Analyze Features

In [None]:
# Every column except the row identifier and the target is treated as a
# feature. `feature_cols` is reused by the correlation cell further down.
feature_cols = [col for col in train_df.columns if col not in ('id', 'melting_point')]

# Number of missing entries per feature column.
missing_values = train_df[feature_cols].isnull().sum()
features_with_missing = missing_values[missing_values > 0]

# Report only what applies: previously the "no missing values" case still
# printed the header followed by an empty Series before the final message.
if features_with_missing.empty:
    print("No missing values found!")
else:
    print("Missing values per feature:")
    print(features_with_missing)

In [None]:
# Pearson correlation of each numeric feature with the target.
#
# * Only numeric feature columns are used: calling .corr() on a frame that
#   contains text columns raises in pandas >= 2.0.
# * corrwith() computes just the feature-vs-target correlations instead of
#   the full feature-by-feature matrix.
# * Constant or all-missing features yield NaN correlations; they are dropped
#   so they cannot show up in the "negatively correlated" listing (NaN sorts
#   to the end of the series).
numeric_features = train_df[feature_cols].select_dtypes(include='number')
correlations = (
    numeric_features
    .corrwith(train_df['melting_point'])
    .dropna()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
correlations.plot(kind='bar', ax=ax)
ax.set_xlabel('Features')
ax.set_ylabel('Correlation with Melting Point')
ax.set_title('Feature Correlations with Melting Point')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

print("\nTop 5 positively correlated features:")
print(correlations.head())
print("\nTop 5 negatively correlated features:")
# Reverse the tail so the most negative correlation is listed first.
print(correlations.tail().iloc[::-1])

## 4. Train Models

In [None]:
# Configure the project's TrainingPipeline with the train/test CSV locations
# and the model directory (paths are relative to the notebook's folder).
pipeline_kwargs = {
    'train_path': '../data/train.csv',
    'test_path': '../data/test.csv',
    'model_dir': '../models',
}
pipeline = TrainingPipeline(**pipeline_kwargs)

# load_and_prepare_data() yields four objects, unpacked here in the order
# train features, validation features, train target, validation target.
prepared_splits = pipeline.load_and_prepare_data()
X_train, X_val, y_train, y_val = prepared_splits

In [None]:
# Train all models on the training split, passing the validation split along.
# The returned `results` is consumed below as a mapping of model name to a
# result whose ['val_metrics'] entry holds the keys 'mae', 'rmse' and 'r2'.
results = pipeline.train_all_models(X_train, y_train, X_val, y_val)

## 5. Compare Model Performance

In [None]:
# Build one record per trained model from its validation metrics, then rank
# the models by validation MAE (lowest first).
metric_columns = (('Val MAE', 'mae'), ('Val RMSE', 'rmse'), ('Val R²', 'r2'))

comparison_rows = [
    {
        'Model': model_name,
        **{column: result['val_metrics'][key] for column, key in metric_columns},
    }
    for model_name, result in results.items()
]

# Columns are passed explicitly so the frame keeps its layout even when
# `results` is empty.
comparison_df = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Val MAE', 'Val RMSE', 'Val R²'],
).sort_values('Val MAE')

print(comparison_df.to_string(index=False))

In [None]:
# One horizontal bar chart per validation metric, all sharing the model order
# of `comparison_df` (the y-axis is inverted so the first row is at the top).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (dataframe column, x-axis label, panel title) for each panel, left to right.
panel_specs = [
    ('Val MAE', 'MAE', 'Validation MAE by Model'),
    ('Val RMSE', 'RMSE', 'Validation RMSE by Model'),
    ('Val R²', 'R²', 'Validation R² by Model'),
]

for panel_ax, (metric_column, axis_label, panel_title) in zip(axes, panel_specs):
    panel_ax.barh(comparison_df['Model'], comparison_df[metric_column])
    panel_ax.set_xlabel(axis_label)
    panel_ax.set_title(panel_title)
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 6. Create Ensemble and Generate Predictions

In [None]:
# Build the ensemble from every model type present in `results`, handing the
# pipeline the same train/validation splits used for the individual models.
ensemble_members = list(results.keys())

ensemble = pipeline.create_ensemble(
    model_types=ensemble_members,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

In [None]:
# Ask the pipeline for test-set predictions from the model named 'ensemble',
# with '../submission.csv' passed as the output path.
submission = pipeline.generate_predictions(
    model_name='ensemble', output_path='../submission.csv'
)

# Same two messages as before, emitted by a single print call.
print("\nSubmission file created!", "\nFirst few predictions:", sep="\n")
submission.head(10)

## 7. Analyze Predictions

In [None]:
# Compare the predicted melting points with the training targets.
# Left panel: predictions alone (raw counts).
# Right panel: training targets and predictions overlaid.
train_target = train_df['melting_point']
predicted = submission['melting_point']

# The two samples generally differ in size, so raw counts on independently
# chosen bins are not comparable. Use one set of bin edges spanning both
# samples and normalise each histogram to a density.
shared_bins = np.histogram_bin_edges(
    np.concatenate([train_target.to_numpy(), predicted.to_numpy()]),
    bins=30,
)

fig, (pred_ax, overlay_ax) = plt.subplots(1, 2, figsize=(12, 5))

pred_ax.hist(predicted, bins=30, edgecolor='black', alpha=0.7)
pred_ax.set_xlabel('Predicted Melting Point (°C)')
pred_ax.set_ylabel('Frequency')
pred_ax.set_title('Distribution of Predicted Melting Points')

overlay_ax.hist(train_target, bins=shared_bins, density=True,
                edgecolor='black', alpha=0.7, label='Training')
overlay_ax.hist(predicted, bins=shared_bins, density=True,
                edgecolor='black', alpha=0.7, label='Predictions')
overlay_ax.set_xlabel('Melting Point (°C)')
overlay_ax.set_ylabel('Density')
overlay_ax.set_title('Training vs Predicted Distribution')
overlay_ax.legend()

fig.tight_layout()
plt.show()

## Summary

This notebook demonstrates:
1. Loading and exploring the melting point dataset
2. Analyzing the target variable and feature correlations
3. Training multiple machine learning models
4. Comparing model performance using MAE, RMSE, and R² metrics
5. Creating an ensemble model
6. Generating predictions for submission
7. Comparing the distribution of the predictions with the training targets

The best model can be selected based on validation MAE, and the ensemble often provides improved performance.