# **Materials Analysis**

# **1. Data Loading**




In [None]:
import kagglehub
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# --- 1. Download & Load Data ---
# Download the dataset; the returned value is used below as the local folder
# that holds the downloaded files.
path = kagglehub.dataset_download("allanwandia/material-science")

# Find the CSV files in the download folder. os.listdir() returns entries in
# arbitrary order, so sort them to make the choice of file reproducible.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not csv_files:
    # Fail with a clear message instead of an opaque IndexError on csv_files[0]
    raise FileNotFoundError(f"No CSV file found in downloaded dataset folder: {path}")
full_path = os.path.join(path, csv_files[0])

df = pd.read_csv(full_path)

# **2. Checking data information**




# Basic commands for checking data information

```python
df.info()
df.head()
df.tail()
df.describe()
df.columns
```

In [None]:
# Print a concise summary of df: column names, dtypes and non-null counts
df.info()

# Check for missing values

In [None]:
# Report how many null entries each column of df contains
print('Missing values in each column:')
missing_counts = df.isna().sum()
print(missing_counts)



If any numeric columns have missing values, fill them with the column median; fill missing values in categorical columns with 'Unknown'.

In [None]:
# Basic cleaning: if any numeric columns have missing values, fill them with the median.
# The filled column is assigned back to df rather than calling
# fillna(..., inplace=True) on df[col]: the in-place form operates on an
# intermediate object, emits a FutureWarning in pandas 2.x and leaves df
# unchanged when Copy-on-Write is enabled.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)

# For any categorical columns with missing values, fill with 'Unknown'
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna('Unknown')

# Final shape after cleaning
print('Data shape after cleaning:', df.shape)

In [None]:
# Display the last rows of df (pandas shows five by default)
df.tail()

In [None]:
# Display summary statistics for the columns of df
df.describe()

# Exploratory Data Analysis

**Count Plot for boolean columns**

In [None]:
## Count Plot for boolean columns
# One figure per boolean column, showing how often each value occurs
bool_cols = df.select_dtypes(include=['bool']).columns
for col in bool_cols:
    plt.figure(figsize=(6, 4))
    ax = sns.countplot(x=df[col])
    ax.set_title(f'Count Plot of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.tight_layout()
    plt.show()

In [None]:
# Pair plot of every numeric column against every other.
# numeric_df is not defined by any earlier cell, so build it here from the
# numeric columns of df; otherwise this cell fails with a NameError.
numeric_df = df.select_dtypes(include=[np.number])
sns.pairplot(numeric_df)
plt.suptitle('Pair Plot for Numeric Features', y=1.02)
plt.show()

# **3. Prediction Modeling**

# Define target variable and select feature set


In [None]:
# Peek at the first five values of the raw column used as the target below
df['is_semiconductor'].head(5)

In [None]:
# Target: encode the is_semiconductor flag as an integer (True -> 1, False -> 0)
# on a copy of df, so the original frame is left untouched.
semiconductor_flags = df['is_semiconductor'].astype(int)
df_class = df.copy()
df_class['target'] = semiconductor_flags
df_class['target'].head(5)

# Features: Physical properties

In [None]:
# Features: the physical-property columns used as model inputs
features = [
    'density',
    'formation_energy_per_atom',
    'volume',
    'n_elements',
]
# X holds the input columns, y the integer-encoded target
X, y = df_class[features], df_class['target']

# Split Data

In [None]:
# Split Data: hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Train Model

In [None]:
# Train Model ---
# Fit a 100-tree random forest on the training split (seeded for repeatability);
# fit() returns the estimator itself, so construction and training can be chained.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print(f"Classifier trained on {len(X_train)} materials.")

# **4. Model Evaluation**

In [None]:
# Evaluate ---
# Predict on the held-out split and report the share of correct predictions
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {acc*100:.2f}%")

In [None]:
# Visualization (Confusion Matrix) ---
# Pin the label order so the matrix is always 2x2 and rows/columns 0 and 1
# line up with the tick labels, even if one class happens to be absent from
# y_test or y_pred.
class_names = ['Metal', 'Semiconductor']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()



# **5. Model comparison**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# ... (Imports and Data Split code remains the same) ...

# Candidate classifiers, all trained and scored on the same train/test split
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# --- TRACKING VARIABLES ---
# Start below any possible accuracy so that a model is always selected, even
# if every candidate scores 0.0 (otherwise None would be saved further down).
best_model = None
best_score = float("-inf")
best_name = ""

print(f"{'Model Name':<25} | {'Accuracy':<10}")
print("-" * 40)

for name, model in models.items():

    # Train
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"{name:<25} | {acc:.4f}")

    # --- CHECK IF THIS IS THE NEW CHAMPION ---
    # Strict '>' keeps the earlier model when two candidates tie
    if acc > best_score:
        best_score = acc
        best_model = model
        best_name = name

print("-" * 40)
# The trophy emoji replaces a mis-encoded (mojibake) byte sequence
print(f"🏆 The Winner is: {best_name} with {best_score:.4f} accuracy!")

# --- SAVE THE CHAMPION ---
joblib.dump(best_model, 'best_materials_model.pkl')
print(f"Saved {best_name} to 'best_materials_model.pkl'")

# **6. Feature importances**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Get the importance scores
# Tree-based models (Random Forest / Gradient Boosting / Decision Trees)
# expose feature_importances_. Logistic Regression is also a candidate for
# best_model and only exposes coef_, so handle both instead of crashing with
# an AttributeError.
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    # Fallback: absolute coefficients, normalized to sum to 1 so they fit the
    # 0-to-1 axis below. These depend on the scale of each feature, so treat
    # them as a rough indication only.
    abs_coef = np.abs(best_model.coef_).mean(axis=0)
    coef_total = abs_coef.sum()
    importances = abs_coef / coef_total if coef_total > 0 else abs_coef
    print("Note: model has no feature_importances_; "
          "showing normalized absolute coefficients instead.")
else:
    raise AttributeError(
        f"{type(best_model).__name__} exposes neither feature_importances_ nor coef_"
    )

# 2. Organize them into a clean table
# 'features' is the list of input column names the models were trained on:
# ['density', 'formation_energy_per_atom', 'volume', 'n_elements']
feature_table = pd.DataFrame({
    'Feature': features,
    'Importance': importances
})

# 3. Sort by importance (Highest on top)
feature_table = feature_table.sort_values(by='Importance', ascending=False)

print(feature_table)

# 4. (Optional) Visualize it!
plt.figure(figsize=(8, 4))
plt.barh(feature_table['Feature'], feature_table['Importance'], color='cornflowerblue')
plt.xlabel('Importance Score (0 to 1)')
plt.title('What mattered most to the model?')
plt.gca().invert_yaxis() # Put the most important at the top
plt.show()

# **7. Load the best model**

In [None]:
import joblib
import pandas as pd

# 1. LOAD the model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files you created yourself or otherwise trust.
loaded_model = joblib.load('best_materials_model.pkl')

# 2. PREPARE INPUT
# Define the raw values, one per feature, in this order:
# ['density', 'formation_energy_per_atom', 'volume', 'n_elements']
raw_data = [[5.0, -1.2, 100, 3]]

# Create a DataFrame using the feature names stored in the model.
# This tells sklearn exactly which value corresponds to which feature and
# avoids the "X does not have valid feature names" warning.
new_material_df = pd.DataFrame(raw_data, columns=loaded_model.feature_names_in_)

# 3. PREDICT
prediction = loaded_model.predict(new_material_df)

# The circle emoji replace mis-encoded (mojibake) byte sequences
print("Prediction:",
      "🟢 Semiconductor" if prediction[0] == 1 else "🔴 Not Semiconductor")
