In [None]:
# Step 1: Mount Google Drive inside the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Step 2: Import necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import altair as alt

# Step 3: Load the original dataset (update path to your file location in Google Drive)
file_path = ''  ## add your dataset here

# keep_default_na=False stops pandas from converting literal markers such as
# 'NA' into NaN while parsing. With the default parsing those cells become the
# string 'nan' once the combined frame is cast to str, so they would no longer
# fall into the same category as the 'NA' values of the manually added rows.
data = pd.read_csv(file_path, encoding='ISO-8859-1', keep_default_na=False)

# Manually added data points, one tuple per point. The composition is the same
# for every point; the last field is the label shown in the chart legend.
extra_point_columns = [
    'Composition',
    'Spincoating Speed',
    'Substrates preheated Temperature (°C)',
    'Antisolvent Used',
    'Solution preheated Temperature',
    'Legend',
]
extra_point_rows = [
    ('AgBiI4', '6000rpm 30s', 140, 'NA', 140, 'RL_best'),
    ('AgBiI4', '6000rpm 30s', 120, 'NA', 120, 'RL_2'),
    ('AgBiI4', '1000rpm 2s, 6000rpm 30s', 110, 'CHLOROBENZENE', 110, 'RL_within'),
    ('AgBiI4', '1000rpm 2s, 6000rpm 30s', 110, 'TOLUENE', 110, 'Daisy_Best'),
    ('AgBiI4', '6000rpm 30s', 100, 'NA', 100, 'RL_3'),
]
additional_data = pd.DataFrame(extra_point_rows, columns=extra_point_columns)

# Tag every row of the original dataset with the legend value 'Original' so it
# can be told apart from the manually added points after the frames are stacked
data['Legend'] = 'Original'

# Stack the original rows and the additional points under a fresh 0..n-1 index,
# then cast every column (temperatures included) to str so that all values are
# handled as categories downstream
frames_to_stack = [data, additional_data]
data_combined = pd.concat(frames_to_stack, ignore_index=True).astype(str)

# One-hot encode every feature column. 'Legend' is left out of the encoded
# matrix; each distinct string value of the remaining columns (an 'NA' string
# included) is encoded as its own category.
feature_frame = data_combined.drop(columns=['Legend'])
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
data_encoded = encoder.fit_transform(feature_frame)

# Step 4: Project the one-hot matrix onto two principal components
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_encoded)

# Collect the 2-D coordinates together with each row's composition (shown in
# tooltips as 'Label') and its legend label
pca_df = pd.DataFrame(data_pca, columns=['PCA1', 'PCA2']).assign(
    Label=data_combined['Composition'],
    Legend=data_combined['Legend'],
)

# Step 5: Visualize in Altair
is_original = pca_df['Legend'] == 'Original'

# Background layer: every original data point in one colour, without a legend
base_chart = alt.Chart(pca_df[is_original]).mark_circle(size=60, color='lightblue').encode(
    x='PCA1',
    y='PCA2',
    tooltip=['PCA1', 'PCA2', 'Label']
)

# Shape assigned to each additional point. Labels that are present in the data
# but missing from this mapping fall back to a plain circle instead of being
# left out of the shape scale.
highlight_shapes = {
    'RL_best': 'square',
    'RL_2': 'triangle',
    'RL_3': 'diamond',
    'RL_within': 'cross',
    'Daisy_Best': 'triangle-down',
}

# Derive the scale domain from the data so that every highlighted label gets
# both a colour and a shape; sorted so the colour and shape scales share one
# identical domain.
highlight_labels = sorted(pca_df.loc[~is_original, 'Legend'].unique().tolist())
highlight_shape_range = [highlight_shapes.get(label, 'circle') for label in highlight_labels]
highlight_title = 'Additional Points'

# Highlight layer: mark_point (filled) is used instead of mark_circle because
# the shape channel only applies to point marks; on a circle mark the per-label
# shapes are dropped. Colour and shape use the same title and domain so the two
# encodings can share a single legend.
highlight_chart = alt.Chart(pca_df[~is_original]).mark_point(size=100, filled=True).encode(
    x='PCA1',
    y='PCA2',
    color=alt.Color(
        'Legend:N',
        scale=alt.Scale(domain=highlight_labels, scheme='category10'),
        title=highlight_title,
    ),
    shape=alt.Shape(
        'Legend:N',
        scale=alt.Scale(domain=highlight_labels, range=highlight_shape_range),
        title=highlight_title,
    ),
    tooltip=['PCA1', 'PCA2', 'Legend']
)

# Combine both charts
final_chart = base_chart + highlight_chart

# Display the chart with all compositions shown in the background, only additional points in legend
final_chart.properties(
    title='2D PCA Projection with All Data',
    width=600,
    height=400
).interactive()