# Thermophysical Property Analysis: Melting Point Prediction

## Goal
Predict the melting point (`Tm`) of organic molecules using group contribution features.

## Data
- `train.csv`: Features + Target (`Tm`)
- `test.csv`: Features only
- `sample_submission.csv`: Submission format

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Lift the column cap so wide feature tables render every column instead of
# being elided in the middle.
pd.options.display.max_columns = None

## 1. Load Data

In [None]:
# Read the training table (features + Tm), the test table (features only),
# and the submission template from the working directory.
train_df, test_df, submission_df = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sample_submission.csv'),
)

# Sanity check: (rows, columns) of the two feature tables.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Preview the first five rows of the training table.
train_df.head(n=5)

## 2. EDA

In [None]:
# Histogram of the target with a kernel-density overlay, drawn on an
# explicitly created 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Tm'], kde=True, ax=ax)
ax.set_title('Distribution of Melting Point (Tm)')
ax.set_xlabel('Tm (K)')
plt.show()

In [None]:
# Count missing cells per table: isna() flags each cell, the first sum()
# totals per column, the second sum() totals across columns.
train_missing = train_df.isna().sum().sum()
print("Missing values in Train:", train_missing)

test_missing = test_df.isna().sum().sum()
print("Missing values in Test:", test_missing)

## 3. Baseline Model
Fit a Random Forest regressor on the group contribution features as a baseline.

In [None]:
# Every training column is a model input except the identifier, the SMILES
# string, and the target itself.
target = 'Tm'
excluded_columns = {'id', 'SMILES', 'Tm'}
features = [col for col in train_df.columns if col not in excluded_columns]

# Design matrix and target for training; the same feature columns are taken
# from the test table so both frames share one layout.
X, y = train_df[features], train_df[target]
X_test = test_df[features]

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training with {len(features)} features.")

In [None]:
# Baseline: a 100-tree random forest using all available cores, seeded for
# reproducibility. fit() returns the estimator, so construction and training
# are chained.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the held-out split with mean absolute error (same units as Tm).
val_preds = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=val_preds)

print(f"Validation MAE: {mae:.4f}")

## 4. Submission

In [None]:
# Predict Tm for the test rows with the model fitted above and write the
# predictions into the submission template.
test_preds = model.predict(X_test)
# NOTE(review): the predictions are assigned by position, so this assumes the
# rows of sample_submission.csv are in the same order as test.csv — confirm.
submission_df['Tm'] = test_preds
# index=False keeps the DataFrame's row index out of the CSV.
submission_df.to_csv('submission.csv', index=False)
print("Saved submission.csv")

In [None]:
# Preview the first five rows of the submission table.
submission_df.head(n=5)