In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [18]:
# Build the PCA-reduced dataset: load the raw measurements, encode the label,
# log-transform and standardise the numeric features, project them onto three
# principal components and save the result as "preprocessed.csv".
df = pd.read_csv("data.csv")
required_columns = ['Species', 'Population', 'Temperature', 'Vial', 'Replicate',
                    'Sex', 'Thorax_length', 'l2', 'l3p', 'l3d', 'lpd', 'l3',
                    'w1', 'w2', 'w3', 'wing_loading']
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the loaded frame (avoids SettingWithCopyWarning).
df = df[required_columns].copy()

# Species becomes an integer class label; unparseable wing_loading entries
# become NaN and are removed by the dropna() further down.
df['Species'] = LabelEncoder().fit_transform(df['Species'])
df['wing_loading'] = pd.to_numeric(df['wing_loading'], errors='coerce')

numerical_cols = ['l2', 'l3p', 'lpd', 'l3', 'w1', 'w2', 'w3', 'wing_loading']
for col in numerical_cols:
    # Reflect around (max + 1) so the smallest reflected value is 1 and the
    # logarithm is always defined and non-negative.
    reflected_data = df[col].max() + 1 - df[col]
    df[col + '_log'] = np.log(reflected_data)

scaler = StandardScaler()
# The standardised log values overwrite the ORIGINAL measurement columns; the
# '<name>_log' columns keep the unscaled log values.
df[numerical_cols] = scaler.fit_transform(
    df[[col + '_log' for col in numerical_cols]])
# NOTE(review): the PCA input below therefore contains every feature twice
# (standardised log + unscaled log). Confirm this is intended; if not, feed
# only `numerical_cols` to the PCA.
log_num_cols = []
for i in numerical_cols:
    log_num_cols.append(i)
    log_num_cols.append(i+'_log')


df = df.dropna()
pca = PCA(n_components=3)
pca_components = pca.fit_transform(df[log_num_cols])
pca_df = pd.DataFrame(pca_components, columns=[
                      'PC1', 'PC2', 'PC3'])
# Attach the labels by POSITION. `df` keeps its original (gappy) index after
# dropna() while `pca_df` has a fresh 0..n-1 index, so assigning the Series
# directly would align on index labels and pair rows with the wrong species
# (or NaN where a label was dropped).
pca_df['Species'] = df['Species'].to_numpy()
# Kept as a safety net; PCA output and the positional labels contain no NaN.
pca_df = pca_df.dropna()

X = pca_df.drop('Species', axis=1)
y = pca_df['Species']

final_df = pd.concat([X, y], axis=1)
final_df.to_csv("preprocessed.csv", index=False)

In [19]:
# Without PCA
# Same cleaning pipeline as the PCA variant, but the per-feature columns are
# kept instead of being projected onto principal components.
df = pd.read_csv("data.csv")
required_columns = ['Species', 'Population', 'Temperature', 'Vial', 'Replicate',
                    'Sex', 'Thorax_length', 'l2', 'l3p', 'l3d', 'lpd', 'l3',
                    'w1', 'w2', 'w3', 'wing_loading']
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the loaded frame (avoids SettingWithCopyWarning).
df = df[required_columns].copy()

# Species becomes an integer class label; unparseable wing_loading entries
# become NaN and are removed by the dropna() at the end of the cell.
df['Species'] = LabelEncoder().fit_transform(df['Species'])
df['wing_loading'] = pd.to_numeric(df['wing_loading'], errors='coerce')

numerical_cols = ['l2', 'l3p', 'lpd', 'l3', 'w1', 'w2', 'w3', 'wing_loading']
for col in numerical_cols:
    # Reflect around (max + 1) so the smallest reflected value is 1 and the
    # logarithm is always defined and non-negative.
    reflected_data = df[col].max() + 1 - df[col]
    df[col + '_log'] = np.log(reflected_data)

scaler = StandardScaler()
# The standardised log values overwrite the ORIGINAL measurement columns; the
# '<name>_log' columns keep the unscaled log values.
df[numerical_cols] = scaler.fit_transform(
    df[[col + '_log' for col in numerical_cols]])
# Interleaved list: each original column name followed by its '_log' twin.
log_num_cols = [name
                for base in numerical_cols
                for name in (base, base + '_log')]


df = df.dropna()

In [25]:
# Persist the class label together with the log-transformed feature columns
# (the '<name>_log' columns) without writing the frame's index.
export_columns = [
    'Species',
    'l2_log', 'l3p_log', 'lpd_log', 'l3_log',
    'w1_log', 'w2_log', 'w3_log',
    'wing_loading_log',
]
df[export_columns].to_csv("data_without_pca.csv", index=False)