In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample dataset: five records, written one row at a time.
# The second record has no numeric reading (NaN) so the cleaning step has a gap to fill.
data = pd.DataFrame(
    [
        (1, 'A', 10, 'Good'),
        (2, 'B', np.nan, 'Bad'),
        (3, 'A', 30, 'Excellent'),
        (4, 'C', 40, 'Good'),
        (5, 'B', 50, 'Excellent'),
    ],
    columns=['ID', 'Category', 'Numerical_Value', 'Text_Value'],
)

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [6]:
# 1. Data Cleaning
# Fill the gap in the numeric column with that column's mean:
# learn the mean first, then apply it to the same column.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[['Numerical_Value']])
data[['Numerical_Value']] = imputer.transform(data[['Numerical_Value']])

In [52]:
# 2. Data Transformation
# One-hot encode the categorical column: one indicator column per category level.
encoder = OneHotEncoder()
encoded_categorical = pd.DataFrame(
    encoder.fit_transform(data[['Category']]).toarray(),
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # (available since 1.0) produces the same 'Category_<level>' labels.
    columns=encoder.get_feature_names_out(['Category']),
    # Reuse the source index so the column-wise concat below lines up row for row
    # even if `data` does not carry a default 0..n-1 index.
    index=data.index,
)
data = pd.concat([data, encoded_categorical], axis=1)
# The raw categorical column is now redundant with its indicator columns.
data = data.drop(['Category'], axis=1)



In [53]:
# 3. Feature Engineering
# Extract features from text: character count of each entry.
# The vectorised .str accessor yields NaN for a missing entry instead of raising,
# which apply(len) would do on a non-string value.
data['Text_Length'] = data['Text_Value'].str.len()

# Dimensionality reduction: project the two numeric features onto one component.
# NOTE(review): PCA is variance-driven and the inputs are not standardised here, so the
# larger-scaled column likely dominates the component — confirm this is intended.
pca = PCA(n_components=1)
# fit_transform returns an array of shape (n_samples, 1); take column 0 so that a
# plain 1-D array is stored in the new column.
data['PCA_Component'] = pca.fit_transform(data[['Numerical_Value', 'Text_Length']])[:, 0]
data.head()

Unnamed: 0,ID,Numerical_Value,Text_Value,Category_A,Category_B,Category_C,Text_Length,PCA_Component
0,1,10.0,Good,1.0,0.0,0.0,4,22.571069
1,2,32.5,Bad,0.0,1.0,0.0,3,0.247013
2,3,30.0,Excellent,1.0,0.0,0.0,9,2.207952
3,4,40.0,Good,0.0,0.0,1.0,4,-7.311964
4,5,50.0,Excellent,0.0,1.0,0.0,9,-17.71407


In [54]:
# 4. Data Splitting
# Target is the ID column; features are every column except ID and the raw text.
y = data['ID']
X = data.drop(columns=['ID', 'Text_Value'])
# Hold out 20% of the rows; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# 5. Data Normalization
# Scale numerical features to zero mean / unit variance.
scaled_columns = ['Numerical_Value', 'Text_Length']

# train_test_split hands back row subsets of X; take explicit copies so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Learn mean and standard deviation from the training rows only...
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
# ...and reuse those statistics for the held-out rows.
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [12]:
# Load the transportation dataset from the CSV file next to the notebook
transport_data = pd.read_csv(filepath_or_buffer='synthetic_data.csv')
# Preview the first five rows as a quick sanity check of the columns
transport_data.head(n=5)

Unnamed: 0,Date,Time,Stop/Station,Passenger_Count,Vehicle_ID,Latitude,Longitude,Temperature (°C),Precipitation (mm),Humidity (%),Age_Group,Gender,Feedback
0,2023-11-16,12:03,Johor Bahru,46,TRAIN82,3.906935,106.068464,11,3,63,18-24,Male,Driver was friendly
1,2023-07-14,05:07,Cameron Highlands,17,TRAIN65,4.227106,118.407191,3,3,74,25-40,Female,Seats were uncomfortable
2,2023-09-22,14:11,Ipoh,91,TRAIN38,6.819556,101.272984,27,1,81,40-60,Male,Delay in departure
3,2022-07-12,09:11,Penang,41,BUS245,3.627521,106.22699,1,7,98,25-40,Female,Driver was friendly
4,2023-12-09,16:59,Kuching,53,BUS958,1.418952,117.050925,15,9,71,40-60,Male,Service was excellent


In [13]:
# 1. Data Cleaning
# Handle missing values: mean-impute the passenger count and the three weather readings.
imputed_columns = ['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']
imputer = SimpleImputer(strategy='mean')
transport_data[imputed_columns] = imputer.fit_transform(transport_data[imputed_columns])

In [14]:
# Normalise the age-group header to its underscore form; with pandas' default
# errors='ignore' this leaves the frame unchanged when no 'Age Group' column exists.
transport_data = transport_data.rename({'Age Group': 'Age_Group'}, axis='columns')

In [15]:
# 2. Data Transformation
# One-hot encode the categorical variables: one indicator column per observed level.
encoder = OneHotEncoder()
encoded_categorical = pd.DataFrame(
    encoder.fit_transform(transport_data[['Age_Group', 'Gender']]).toarray(),
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # (available since 1.0) produces the same '<column>_<level>' labels.
    columns=encoder.get_feature_names_out(['Age_Group', 'Gender']),
    # Reuse the source index so the column-wise concat below lines up row for row
    # even if `transport_data` does not carry a default 0..n-1 index.
    index=transport_data.index,
)
transport_data = pd.concat([transport_data, encoded_categorical], axis=1)
# The raw categorical columns are now redundant with their indicator columns.
transport_data = transport_data.drop(['Age_Group', 'Gender'], axis=1)



In [16]:
# 3. Feature Engineering
# Create new features: temperature relative to humidity.
# A humidity of exactly 0 would make the quotient +/-inf, which StandardScaler rejects
# in the normalization step; treat such readings as missing so the ratio becomes NaN.
nonzero_humidity = transport_data['Humidity (%)'].replace(0, float('nan'))
transport_data['Temperature_Humidity_Ratio'] = transport_data['Temperature (°C)'] / nonzero_humidity

In [17]:
# Dimensionality reduction
# Compress the weather readings into two principal components.
# Passenger_Count is deliberately excluded: it is the prediction target in the
# splitting step, and components derived from it would leak the target into the
# feature matrix.
# NOTE(review): the inputs are not standardised before PCA, so the column with the
# largest variance will weigh most — confirm whether scaling first is wanted.
pca = PCA(n_components=2)
transport_data_pca = pca.fit_transform(
    transport_data[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']]
)
# One output column per retained component.
transport_data['PCA_Component1'] = transport_data_pca[:, 0]
transport_data['PCA_Component2'] = transport_data_pca[:, 1]

In [18]:
# 4. Data Splitting
# Target is the passenger count; features are every remaining column.
y = transport_data['Passenger_Count']
X = transport_data.drop(columns=['Passenger_Count'])
# Hold out 20% of the rows; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# 5. Data Normalization
# Scale numerical features to zero mean / unit variance.
scaled_columns = [
    'Temperature (°C)',
    'Precipitation (mm)',
    'Humidity (%)',
    'Temperature_Humidity_Ratio',
    'PCA_Component1',
    'PCA_Component2',
]

# train_test_split hands back row subsets of X; take explicit copies so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Learn mean and standard deviation from the training rows only...
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
# ...and reuse those statistics for the held-out rows.
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])