In [44]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the data
train_df = pd.read_csv('/Users/yashaswipatki/Downloads/train.csv')  
test_df = pd.read_csv('/Users/yashaswipatki/Downloads/test.csv')   

In [45]:
# Identify and remove columns with zero variance
zero_variance_cols = train_df.columns[train_df.apply(pd.Series.nunique) == 1]
train_df = train_df.drop(zero_variance_cols, axis=1)
test_df = test_df.drop(zero_variance_cols, axis=1)

# Display the updated datasets
print("Train Data after removing zero variance columns:\n", train_df.head())
print("\nTest Data after removing zero variance columns:\n", test_df.head())

Train Data after removing zero variance columns:
    ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 366 columns]

Test Data after removing zero variance columns:
    ID  X0 X1  X2 X3 X4 X5 X6 X8  X10  ...  X375  X376  X377  X378  X379  X380  \
0   1  az  v   n  f  d  t  a  w    0  ...     0     0     0     1     0     0   
1   2   t  b  ai  a  d  b  g  y    0  ...

In [46]:
# Check for null values
print("Train Data - Null Values:\n", train_df.isnull().sum())
print("\nTest Data - Null Values:\n", test_df.isnull().sum())

# Check for unique values
print("\nTrain Data - Unique Values:\n", train_df.nunique())
print("\nTest Data - Unique Values:\n", test_df.nunique())

Train Data - Null Values:
 ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64

Test Data - Null Values:
 ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 365, dtype: int64

Train Data - Unique Values:
 ID      4209
y       2545
X0        47
X1        27
X2        44
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 366, dtype: int64

Test Data - Unique Values:
 ID      4209
X0        49
X1        27
X2        45
X3         7
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 365, dtype: int64


In [47]:
# Combine train and test data for label encoding
combined_df = pd.concat([train_df, test_df], axis=0)

# Identify categorical columns for label encoding
categorical_cols = combined_df.select_dtypes(include=['object']).columns

# Convert all values to strings before label encoding
combined_df[categorical_cols] = combined_df[categorical_cols].astype(str)

# Apply label encoder to combined data
label_encoder = LabelEncoder()
for col in categorical_cols:
    combined_df[col] = label_encoder.fit_transform(combined_df[col])

# Split the combined data back into train and test sets
train_df = combined_df[:len(train_df)]
test_df = combined_df[len(train_df):]

# Display the updated datasets
print("Train Data after label encoding:\n", train_df.head())
print("\nTest Data after label encoding:\n", test_df.head())

Train Data after label encoding:
    ID       y  X0  X1  X2  X3  X4  X5  X6  X8  ...  X375  X376  X377  X378  \
0   0  130.81  37  23  20   0   3  27   9  14  ...     0     0     1     0   
1   6   88.53  37  21  22   4   3  31  11  14  ...     1     0     0     0   
2   7   76.26  24  24  38   2   3  30   9  23  ...     0     0     0     0   
3   9   80.62  24  21  38   5   3  30  11   4  ...     0     0     0     0   
4  13   78.02  24  23  38   5   3  14   3  13  ...     0     0     0     0   

   X379  X380  X382  X383  X384  X385  
0     0     0     0     0     0     0  
1     0     0     0     0     0     0  
2     0     0     1     0     0     0  
3     0     0     0     0     0     0  
4     0     0     0     0     0     0  

[5 rows x 366 columns]

Test Data after label encoding:
    ID   y  X0  X1  X2  X3  X4  X5  X6  X8  ...  X375  X376  X377  X378  X379  \
0   1 NaN  24  23  38   5   3  26   0  22  ...     0     0     0     1     0   
1   2 NaN  46   3   9   0   3   9   6  

In [48]:
# Combine train and test data for dimensionality reduction
combined_df = pd.concat([train_df, test_df], axis=0)

# Drop rows with missing values
combined_df.dropna(inplace=True)

# Identify features
features = combined_df

# Standardize the features
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

# Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% of the variance
features_pca = pca.fit_transform(features_standardized)

# Split the combined data back into train and test sets
train_df_pca = pd.DataFrame(features_pca[:len(train_df)], columns=[f'PCA_{i+1}' for i in range(features_pca.shape[1])])
test_df_pca = pd.DataFrame(features_pca[len(train_df):], columns=[f'PCA_{i+1}' for i in range(features_pca.shape[1])])

# Display the updated datasets
print("Train Data after dimensionality reduction:\n", train_df_pca.head())
print("\nTest Data after dimensionality reduction:\n", test_df_pca.head())

Train Data after dimensionality reduction:
        PCA_1      PCA_2     PCA_3     PCA_4     PCA_5     PCA_6     PCA_7  \
0  12.355577  -2.931375 -0.964477  1.871028 -1.131071 -3.802995  9.041632   
1  -0.146158   0.443786  0.900259  1.328665 -2.575386 -0.199335  1.010269   
2   9.911795  21.433722 -4.588380 -4.588246  0.611327  2.658488  1.143245   
3   6.999444  21.646649 -5.535007 -0.071694  2.115799  0.657452  0.412677   
4   6.203074  21.740216 -6.092918  0.606826  2.195153 -0.074753 -0.993929   

      PCA_8      PCA_9    PCA_10  ...    PCA_141   PCA_142   PCA_143  \
0 -3.593319 -16.149030  8.407628  ...  -0.112764 -0.246420  0.056195   
1 -0.592836  -0.486332  0.144286  ...   0.987769 -0.322127 -0.975282   
2  3.642861  -0.919221  1.314535  ... -11.603568 -7.603832  0.683216   
3 -0.036918   0.164190  2.929633  ...   1.986847  0.871403  0.650220   
4 -0.231459   0.873334  0.814194  ... -12.266493 -8.932799  2.265767   

    PCA_144   PCA_145   PCA_146   PCA_147   PCA_148   PCA_14

In [49]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [50]:
print(combined_df.columns)

Index(['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=366)


In [62]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Assuming you have already performed the previous steps including dimensionality reduction

# Define the target variable
target_column = 'y'  # Replace with the actual target column name
target = combined_df[target_column]

# Split the train_df_pca into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df_pca, target, test_size=0.2, random_state=42)

# Create an XGBoost regressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model on the training set
xgb_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = xgb_reg.predict(X_val)

# Evaluate the model on the validation set (you can use different metrics depending on your task)
mse = mean_squared_error(y_val, y_val_pred)
print(f'Mean Squared Error on Validation Set: {mse}')

# Now, predict the test_df values
print("Number of rows in test_df_pca:", test_df_pca.shape[0])  # Print the number of rows in test_df_pca
test_predictions = xgb_reg.predict(test_df_pca)

# Check if test_predictions is empty or contains NaN values
if test_predictions is None or any(np.isnan(test_predictions)):
    print("Test predictions are empty or contain NaN values.")
else:
    # Display the predicted values for the test_df
    print("Predicted Values for test_df:\n", test_predictions)


Mean Squared Error on Validation Set: 57.32378750579196
Number of rows in test_df_pca: 0
Predicted Values for test_df:
 []
