In [None]:
# Expose Google Drive inside the runtime at /content/drive
from google.colab import drive

drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Location of the input CSV
file_path = ''  # Update the path accordingly

# Read the file (ISO-8859-1 encoded) and trim leading/trailing whitespace
# from every header so later column lookups by name match exactly.
data = (
    pd.read_csv(file_path, encoding='ISO-8859-1')
    .rename(columns=str.strip)
)


In [None]:
# Columns that are converted to integer category codes
label_features = [
    'Spincoating Speed',
    'Antisolvent Used',
    'Substrates preheated Temperature',
    'Solution preheated temperature',
]

# Fit one encoder per column; missing entries become their own 'NA' category.
# The fitted encoders are kept so the codes can be mapped back to labels later.
# NOTE(review): values are encoded as strings, so codes follow lexicographic
# order rather than numeric order -- confirm this is acceptable if any of these
# columns hold numeric settings.
label_encoders = {}
for column in label_features:
    encoder = LabelEncoder()
    as_text = data[column].fillna('NA').astype(str)
    data[column] = encoder.fit_transform(as_text)
    label_encoders[column] = encoder

# Feature engineering: expand the encoded inputs into degree-2 polynomial terms
# (the inputs themselves, their squares and their pairwise products).
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(data[label_features])
poly_feature_names = poly.get_feature_names_out(label_features)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)

# Mean-impute any gaps in the polynomial terms. The encoded inputs have had
# their missing values mapped to the 'NA' category, so this is a safeguard
# rather than something expected to change values.
imputer = SimpleImputer(strategy='mean')
poly_df_imputed = imputer.fit_transform(poly_df)
poly_df = pd.DataFrame(poly_df_imputed, columns=poly_feature_names)

# Append only the genuinely new terms. The degree-1 outputs carry the same
# names as the columns already present in `data`; concatenating them as well
# would create duplicate column labels, and selecting the feature list from
# `data` would then return every base feature twice.
engineered_terms = poly_df.drop(columns=label_features).reset_index(drop=True)

# Discard engineered columns left over from a previous run of this step so
# that re-executing it does not stack additional copies.
base_data = data.drop(columns=engineered_terms.columns, errors='ignore').reset_index(drop=True)

data = pd.concat([base_data, engineered_terms], axis=1)

# Model inputs are the polynomial feature columns; the two targets are
# modelled independently of each other.
features = poly_feature_names
target_coverage = 'Coverage Percentage'
target_avg_size = 'Average Size um'

# Coverage: keep rows with a recorded target (no imputation), then hold out 20 %
data_cov = data.dropna(subset=[target_coverage])
X_cov, y_cov = data_cov[features], data_cov[target_coverage]
X_train_cov, X_test_cov, y_train_cov, y_test_cov = train_test_split(
    X_cov, y_cov, test_size=0.2, random_state=42
)

# Average size: same treatment -- rows with an empty target are dropped, not imputed
data_size = data.dropna(subset=[target_avg_size])
X_size, y_size = data_size[features], data_size[target_avg_size]
X_train_size, X_test_size, y_train_size, y_test_size = train_test_split(
    X_size, y_size, test_size=0.2, random_state=42
)

# Hyper-parameter candidates tried for each Random Forest
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_features': ['log2'],
    'max_depth': [None],
    'min_samples_split': [10, 15, 20],
}


def _new_grid_search():
    """Return an unfitted 5-fold grid search over ``param_grid`` scored by negative MSE."""
    return GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
    )


# One independent search per target
grid_search_cov = _new_grid_search()
grid_search_size = _new_grid_search()

# Coverage Percentage: run the search and keep the refit best model
grid_search_cov.fit(X_train_cov, y_train_cov)
best_rf_cov = grid_search_cov.best_estimator_

# Average size: run the search and keep the refit best model
grid_search_size.fit(X_train_size, y_train_size)
best_rf_size = grid_search_size.best_estimator_



In [None]:
def _rmse(actual, predicted):
    """Root-mean-squared error between observed and predicted values."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Coverage Percentage: predictions and errors on both partitions
y_train_pred_cov = best_rf_cov.predict(X_train_cov)
y_test_pred_cov = best_rf_cov.predict(X_test_cov)
train_rmse_cov = _rmse(y_train_cov, y_train_pred_cov)
test_rmse_cov = _rmse(y_test_cov, y_test_pred_cov)

# Average size: predictions and errors on both partitions
y_train_pred_size = best_rf_size.predict(X_train_size)
y_test_pred_size = best_rf_size.predict(X_test_size)
train_rmse_size = _rmse(y_train_size, y_train_pred_size)
test_rmse_size = _rmse(y_test_size, y_test_pred_size)

# Bundle observed values, predictions and RMSEs per target so the parity
# plots can be produced later without refitting the models.
results_cov = dict(
    y_train_actual=y_train_cov,
    y_train_predicted=y_train_pred_cov,
    y_test_actual=y_test_cov,
    y_test_predicted=y_test_pred_cov,
    train_rmse=train_rmse_cov,
    test_rmse=test_rmse_cov,
)

results_size = dict(
    y_train_actual=y_train_size,
    y_train_predicted=y_train_pred_size,
    y_test_actual=y_test_size,
    y_test_predicted=y_test_pred_size,
    train_rmse=train_rmse_size,
    test_rmse=test_rmse_size,
)

# Persist both bundles to the current working directory
for bundle, filename in (
    (results_cov, 'results_coverage.joblib'),
    (results_size, 'results_size.joblib'),
):
    joblib.dump(bundle, filename)

print("Results saved for parity plot generation.")


In [None]:
# Read back the persisted result bundles. joblib files are pickle-based, so
# only load files that were produced by a trusted source.
results_cov, results_size = (
    joblib.load(path)
    for path in ('results_coverage.joblib', 'results_size.joblib')
)

def remove_outliers(y_actual, y_predicted, threshold=3):
    """Drop points whose residual is an outlier by z-score.

    The residual ``y_actual - y_predicted`` is computed position by position
    (index labels of pandas inputs are ignored), standardised with the
    population standard deviation (``ddof=0``), and points whose absolute
    z-score is not below ``threshold`` are removed from both inputs.

    Parameters
    ----------
    y_actual, y_predicted : array-like of equal length
        Observed and predicted values (NumPy arrays, pandas Series, or
        anything ``np.asarray`` can convert to a float array).
    threshold : float, default 3
        Points with ``|z| >= threshold`` are discarded.

    Returns
    -------
    tuple
        ``(y_actual_kept, y_predicted_kept)``. NumPy arrays and pandas Series
        are returned as the same type (a Series keeps its index labels); other
        inputs are returned as NumPy arrays.

    Notes
    -----
    * When all residuals are identical the spread is zero and no z-score can
      be formed; every finite point is kept instead of being dropped.
    * Points with a non-finite residual (NaN/inf) are always removed.
    """
    import pandas as pd  # local import keeps the helper usable on its own

    actual_arr = np.asarray(y_actual, dtype=float)
    predicted_arr = np.asarray(y_predicted, dtype=float)
    residuals = actual_arr - predicted_arr

    # Only finite residuals take part in the statistics and can be kept.
    finite = np.isfinite(residuals)
    if finite.any():
        center = residuals[finite].mean()
        # ndarray.std uses ddof=0, independent of the caller's container type.
        spread = residuals[finite].std()
        if spread == 0:
            # Zero spread: nothing can be an outlier.
            mask = finite
        else:
            safe_residuals = np.where(finite, residuals, center)
            mask = finite & (np.abs(safe_residuals - center) / spread < threshold)
    else:
        mask = finite  # empty input or no finite residuals: nothing to keep

    def _select(original, as_array):
        # Preserve the caller's container type where boolean masking is supported.
        if isinstance(original, (np.ndarray, pd.Series)):
            return original[mask]
        return as_array[mask]

    return _select(y_actual, actual_arr), _select(y_predicted, predicted_arr)

# Coverage Percentage: discard points with extreme residuals before plotting
y_train_cov_actual_filtered, y_train_cov_predicted_filtered = remove_outliers(
    results_cov['y_train_actual'], results_cov['y_train_predicted']
)
y_test_cov_actual_filtered, y_test_cov_predicted_filtered = remove_outliers(
    results_cov['y_test_actual'], results_cov['y_test_predicted']
)

# Parity plot (actual vs. predicted) on a fixed 0-25 window for both axes.
# The legend shows the stored RMSE values from the result bundle.
cov_axis_range = (0, 25)
fig_cov, ax_cov = plt.subplots(figsize=(8, 6))
ax_cov.scatter(
    y_train_cov_actual_filtered, y_train_cov_predicted_filtered,
    color='blue', alpha=0.6, marker='o',
    label=f'Train RMSE: {results_cov["train_rmse"]:.2f}',
)
ax_cov.scatter(
    y_test_cov_actual_filtered, y_test_cov_predicted_filtered,
    color='red', alpha=0.6, marker='o',
    label=f'Test RMSE: {results_cov["test_rmse"]:.2f}',
)
ax_cov.plot(cov_axis_range, cov_axis_range, 'k--', lw=2)  # y = x reference line
ax_cov.set_xlim(*cov_axis_range)
ax_cov.set_ylim(*cov_axis_range)
ax_cov.set_xlabel('Actual Coverage Percentage')
ax_cov.set_ylabel('Predicted Coverage Percentage')
ax_cov.set_title('Parity Plot - Coverage Percentage (Train vs Test)')
ax_cov.legend()
fig_cov.tight_layout()
plt.show()

# Average size: discard points with extreme residuals before plotting
y_train_size_actual_filtered, y_train_size_predicted_filtered = remove_outliers(
    results_size['y_train_actual'], results_size['y_train_predicted']
)
y_test_size_actual_filtered, y_test_size_predicted_filtered = remove_outliers(
    results_size['y_test_actual'], results_size['y_test_predicted']
)

# Parity plot (actual vs. predicted) on a fixed 0-0.3 window for both axes.
# The legend shows the stored RMSE values from the result bundle.
size_axis_range = (0, 0.3)
fig_size, ax_size = plt.subplots(figsize=(8, 6))
ax_size.scatter(
    y_train_size_actual_filtered, y_train_size_predicted_filtered,
    color='blue', alpha=0.6, marker='o',
    label=f'Train RMSE: {results_size["train_rmse"]:.2f}',
)
ax_size.scatter(
    y_test_size_actual_filtered, y_test_size_predicted_filtered,
    color='red', alpha=0.6, marker='o',
    label=f'Test RMSE: {results_size["test_rmse"]:.2f}',
)
ax_size.plot(size_axis_range, size_axis_range, 'k--', lw=2)  # y = x reference line
ax_size.set_xlim(*size_axis_range)
ax_size.set_ylim(*size_axis_range)
ax_size.set_xlabel('Actual Average Size Scaled')
ax_size.set_ylabel('Predicted Average Size Scaled')
ax_size.set_title('Parity Plot - Average Size Scaled (Train vs Test)')
ax_size.legend()
fig_size.tight_layout()
plt.show()
