# WIGE Hackathon 2025 - Contact Angle Prediction
## Carbon Capture, Utilization, and Storage (CCUS) Assessment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


In [None]:

# Seed NumPy's legacy global RNG so NumPy-driven randomness repeats across runs
np.random.seed(seed=42)

## 1. Data Loading and Initial Exploration

In [None]:
# Read the three CSV inputs from the working directory, in a fixed order
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report the dimensions of both sets, then the column summary of the training frame
print("Training set shape:", train_df.shape)
print("\nTest set shape:", test_df.shape)
print("\nTraining set info:")
train_df.info()

## 2. Exploratory Data Analysis

In [None]:
def plot_feature_distributions(df, columns=None):
    """Plot a histogram of each selected column on a two-column grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    columns : sequence of str, optional
        Names of the columns to plot. When omitted, 'pressure', 'temperature',
        'salinity' and 'theta0' are plotted on a 2x2 grid.

    Raises
    ------
    ValueError
        If ``columns`` is given but empty.
    """
    if columns is None:
        columns = ['pressure', 'temperature', 'salinity', 'theta0']
    columns = list(columns)
    if not columns:
        raise ValueError("columns must contain at least one column name")

    n_grid_cols = 2
    # Ceiling division: enough rows to hold every requested column
    n_grid_rows = (len(columns) + n_grid_cols - 1) // n_grid_cols
    # squeeze=False keeps `axes` two-dimensional even when there is a single row
    fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                             figsize=(15, 5 * n_grid_rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major, so panel i sits at (i // 2, i % 2)
    for ax, col in zip(flat_axes, columns):
        sns.histplot(data=df, x=col, ax=ax)
        ax.set_title(f'Distribution of {col}')
    # Hide the unused trailing panel when the number of columns is odd
    for ax in flat_axes[len(columns):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

# Plot distributions of the default numerical features of the training set
plot_feature_distributions(train_df)



In [None]:
# Correlate only the numerical inputs and the contact_angle target
numerical_cols = ['pressure', 'temperature', 'salinity', 'theta0', 'contact_angle']
correlation_matrix = train_df[numerical_cols].corr()

# Draw the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlations (Numerical Variables)')
plt.show()

## 3. Feature Engineering

In [None]:
def engineer_features(df, encoders=None):
    """Return a copy of ``df`` with encoded categoricals and interaction features.

    Adds ``mineral_encoded`` and ``contact_type_encoded`` (integer codes for the
    ``mineral`` and ``contact_type`` columns) and the products ``pressure_temp``
    (pressure * temperature) and ``salinity_temp`` (salinity * temperature).
    The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'mineral', 'contact_type', 'pressure',
        'temperature' and 'salinity'.
    encoders : dict, optional
        Maps a source column name ('mineral', 'contact_type') to a fitted
        ``LabelEncoder``. An entry that is missing is fitted on ``df`` and
        stored in this dict (the dict is mutated), so passing the same dict to
        a later call reuses the same category-to-integer mapping. When omitted,
        fresh encoders are fitted on ``df`` alone.

    Returns
    -------
    pandas.DataFrame
        The engineered copy of ``df``.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a reused encoder meets a
        label that was not present in the data it was fitted on.
    """
    # Work on a copy to avoid modifying the caller's frame
    df = df.copy()
    if encoders is None:
        encoders = {}

    # Encode categorical variables, fitting an encoder only if none was supplied
    for source_col, encoded_col in (('mineral', 'mineral_encoded'),
                                    ('contact_type', 'contact_type_encoded')):
        if source_col not in encoders:
            encoders[source_col] = LabelEncoder().fit(df[source_col])
        df[encoded_col] = encoders[source_col].transform(df[source_col])

    # Create interaction features
    df['pressure_temp'] = df['pressure'] * df['temperature']
    df['salinity_temp'] = df['salinity'] * df['temperature']

    return df

# Apply feature engineering. The encoders are fitted on the training set and
# reused for the test set, so a given category maps to the same integer code
# in both frames (fitting separately could assign different codes).
label_encoders = {}
train_engineered = engineer_features(train_df, label_encoders)
test_engineered = engineer_features(test_df, label_encoders)

## 4. Model Development and Evaluation

In [None]:
def train_evaluate_model(X, y, model, model_name, n_splits=5, random_state=42):
    """Cross-validate ``model`` with shuffled K-fold R², print the scores, then fit it.

    Parameters
    ----------
    X : feature matrix passed to ``cross_val_score`` and ``model.fit``.
    y : target values aligned with ``X``.
    model : estimator to evaluate; it is fitted on the full data by this call.
    model_name : str
        Label used in the printed report.
    n_splits : int, optional
        Number of cross-validation folds (default 5).
    random_state : int, optional
        Seed for the fold shuffling (default 42).

    Returns
    -------
    The result of ``model.fit(X, y)``.
    """
    # K-fold cross-validation with a seeded shuffle so the folds are repeatable
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = cross_val_score(model, X, y, cv=kf, scoring='r2')

    print(f"{model_name} R² scores: {scores}")
    # The reported spread is two standard deviations of the per-fold scores
    print(f"Average R²: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

    # Final fit on all rows of X / y
    return model.fit(X, y)

# Columns fed to every model: raw inputs, encoded categoricals, interaction terms
feature_cols = [
    'pressure', 'temperature', 'salinity',
    'mineral_encoded', 'contact_type_encoded',
    'theta0',
    'pressure_temp', 'salinity_temp',
]

# Design matrices and target taken from the engineered frames
X_train = train_engineered[feature_cols]
y_train = train_engineered['contact_angle']
X_test = test_engineered[feature_cols]

# Candidate regressors, keyed by the label used in the printed report
models = dict([
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('LightGBM', LGBMRegressor(random_state=42)),
])

# Cross-validate and fit each candidate, keeping the fitted result under the same label
trained_models = {
    name: train_evaluate_model(X_train, y_train, model, name)
    for name, model in models.items()
}

## 5. Generate Predictions and Submission File

In [None]:
# Average the test-set predictions of every trained model with equal weight
ensemble_members = list(trained_models.values())
predictions = np.zeros(len(X_test))
for fitted_model in ensemble_members:
    # Accumulate into the float64 buffer in place
    np.add(predictions, fitted_model.predict(X_test), out=predictions)
predictions = predictions / len(ensemble_members)



In [None]:
# Preview the first rows of the submission with the ensemble predictions filled in.
# `assign` returns a new frame, so `submission_df` itself is left untouched by this
# cell and the preview no longer shows the unmodified sample submission.
print("First 5 rows of the submission file with predictions:")
display(submission_df.assign(contact_angle=predictions).head())

In [None]:
# Store the ensemble predictions in the submission frame and save it without the index
submission_df['contact_angle'] = predictions
submission_df.to_csv(path_or_buf='submission.csv', index=False)
print("Submission file created successfully!")