### Data Loading and Preprocessing

In [439]:
import pandas as pd
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
X_train = train[train!='?'].dropna().drop('target', axis = 1)
# X_test = test[test!='?'].dropna().drop('target', axis = 1)
Y_train = train[train!='?'].dropna()['target']
# Y_test = test['target']

In [440]:
print(train.shape, test.shape, X_train.shape, Y_train.shape)

(30557, 82) (16136, 77) (3915, 81) (3915,)


### Check the datatype of each features

In [441]:
# Get the data types of each column
data_types = train.dtypes

# # Print the data types of each column
# for column, dtype in data_types.items():
#     print(f"'{column}' has data type: {dtype}")
#     print(dtype)

# Show the distinct types of the features
print(data_types.unique())

[dtype('O') dtype('float64') dtype('int64')]


from nicholas's ipynb we know the first 3 object types exist no nan values, thus we can do the inputations for the remaining int64/ float64 features.

### Other Imputation methods

#### data imputation by mean/ mode/ median

In [442]:
# function to check if all na is replaced
def check_isna(df):
    return df.isnull().any(axis=1).sum()
    # df.isna().any(axis=1).sum()
print(check_isna(train))

26642


In [443]:
X_mean_train = train.copy()
Y_mean_train = test.copy()
X_mode_train = train.copy()
Y_mode_train = test.copy()
X_median_train = train.copy()
Y_median_train = test.copy()

In [444]:
def statistical_imputation(df, impute_func):
    # print(check_isna(df))
     # Iterate over each column
    for column in df.columns:
        # Check if the column has missing values
        if df[column].isnull().any():
            # Apply the imputation function
            impute_value = impute_func(df[column])
            df[column].fillna(impute_value, inplace=True)
    # print(check_isna(df))
    return df

In [445]:
# Perform mean imputation for each feature column
# print(X_mean_train.isnull().any(axis=1).sum()) #originally is 26642
X_mean_train = statistical_imputation(X_mean_train, lambda x: x.mean())
X_mode_train = statistical_imputation(X_mode_train, lambda x: x.mode().iloc[0])
X_median_train = statistical_imputation(X_median_train, lambda x: x.median)
print()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(impute_value, inplace=True)





1        0.000076
2        0.000067
3        0.000083
4        0.000070
           ...   
30552    0.000063
30553    0.000072
30554    0.000068
30555    0.000086
30556    0.000079
Name: L3_NO2_NO2_column_number_density, Length: 30557, dtype: float64>' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df[column].fillna(impute_value, inplace=True)
1        0.000197
2        0.000170
3        0.000175
4        0.000142
           ...   
30552    0.000165
30553    0.000161
30554    0.000135
30555    0.000148
30556    0.000188
Name: L3_NO2_NO2_slant_column_number_density, Length: 30557, dtype: float64>' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df[column].fillna(impute_value, inplace=True)
1       -1.082553
2       -1.001242
3       -0.777019
4        0.366323
           ...   
30552   -1.412609
30553   -1.449760
30554   -1.722480
30555   -2.065102
30556   -1.556335
Name: L3_NO2_absorbing_aerosol_index

#### Data imputation by regression prediction

In [446]:
import pandas as pd
from sklearn.linear_model import LinearRegression

def regression_impute(df):
    """
    Performs imputation for missing values using regression.

    Args:
        df (pandas DataFrame): The DataFrame to impute.

    Returns:
        pandas DataFrame: The DataFrame with imputed values.
    """
    # Create a copy of the DataFrame for imputation
    imputed_df = df.copy()

    # Iterate over each column
    for column in imputed_df.columns:
        # Check if the column has missing values
        if imputed_df[column].isnull().any():
            # Split the data into features (X) and target (y)
            object_columns = df.select_dtypes(include='O').columns
            X = imputed_df.dropna().drop(column, axis=1).drop(object_columns, axis=1)
            y = imputed_df.dropna()[column]
            print(X.shape, y.shape)
            # Fit a linear regression model
            model = LinearRegression()
            model.fit(X, y)
            if(imputed_df[imputed_df[column].isnull()].drop(column, axis=1).drop(object_columns, axis=1).dropna().shape[0]!=0):
                # Predict missing values
                X_missing = imputed_df[imputed_df[column].isnull()].drop(column, axis=1).drop(object_columns, axis=1).dropna()
                print(X_missing.shape)
                y_missing = model.predict(X_missing)
                # Replace missing values with predictions
                try:
                    imputed_df.loc[imputed_df[column].isnull(), column] = y_missing
                except:
                    print(y_missing) # some of the X_missing.shape is mismatching since more than one column is nan

    return imputed_df

In [447]:
X_regression_train = regression_impute(train)
Y_regression_train = Y_train.copy()
X_regression_train.shape
check_isna(X_regression_train)

(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(3915, 78) (3915,)
(174, 78)
[ 1.71262636e-04  5.80223552e-05  2.16935229e-04  2.37973711e-05
  7.19249744e-05  4.45339553e-05  4.45791457e-05  6.12871093e-05
  2.99844719e-05  1.98020415e-05  4.84109910e-06  1.59058910e-05
  4.05847864e-06  5.43762358e-05  9.36426518e-05  1.55009797e-04
  1.25374371e-04  9.88912080e-05  1.33894401e-04  1.17665510e-04
  7.60084961e-05  1.85226894e-04  1.53127433e-04  1.14545142e-04
 -6.07890459e-06  2.20732691e-05  4.40711185e-05  6.51618322e-05
  6.36291714e-05  1.09731483e-05  4.84371101e-05  3.89883052e-05
  4.20104736e-05  1.32267740e-05  3.00017691e-05  1.38051027e-04
  3.93347310e-05  3.74724768e-05  5.58666746e-05  2.07482963e-05
  2.71878800e-05  9.59723127e-05  4.42638993e-06  3.36595026e-05
  1.06554477e-05 -2.62200425e-05 -8.58248581e-0

26642

#### Data imputation by missing indication with mean imputation

In [448]:
def missing_indicator(df):
    """
    Creates a missing indicator variable for missing values in each column of the DataFrame.

    Args:
        df (pandas DataFrame): The DataFrame to create the missing indicator for.

    Returns:
        pandas DataFrame: The DataFrame with missing indicator variables.
    """
    missing_indicator_df = df.copy()

    for column in missing_indicator_df.columns:
        if df[column].isnull().any():
            missing_values = missing_indicator_df[column].isnull().astype(int)
            missing_indicator_df[column + '_missing'] = missing_values

    return missing_indicator_df

In [449]:
X_missing_train = missing_indicator(train)
Y_missing_train = Y_train.copy()
# check_isna(X_missing_train)
X_missing_train.shape
statistical_imputation(X_missing_train, lambda x: x.mean())
# check_isna(X_missing_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(impute_value, inplace=True)


Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle_missing,L3_SO2_solar_azimuth_angle_missing,L3_SO2_solar_zenith_angle_missing,L3_CH4_CH4_column_volume_mixing_ratio_dry_air_missing,L3_CH4_aerosol_height_missing,L3_CH4_aerosol_optical_depth_missing,L3_CH4_sensor_azimuth_angle_missing,L3_CH4_sensor_zenith_angle_missing,L3_CH4_solar_azimuth_angle_missing,L3_CH4_solar_zenith_angle_missing
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.50,92,11.000000,60.200001,...,0,0,0,0,0,0,0,0,0,0
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.600000,48.799999,...,0,0,0,0,0,0,0,0,0,0
2,010Q650 X 2020-01-04,2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96,16.400000,33.400002,...,0,0,0,1,1,1,1,1,1,1
3,010Q650 X 2020-01-05,2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,...,0,0,0,1,1,1,1,1,1,1
4,010Q650 X 2020-01-06,2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,...,0,0,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30552,YWSFY6Q X 2020-03-15,2020-03-15,YWSFY6Q,22.0,14.0,83.0,3848.86,72,6.700000,68.300003,...,0,0,0,1,1,1,1,1,1,1
30553,YWSFY6Q X 2020-03-16,2020-03-16,YWSFY6Q,53.0,30.0,146.0,9823.87,72,6.300000,77.700005,...,0,0,0,0,0,0,0,0,0,0
30554,YWSFY6Q X 2020-03-17,2020-03-17,YWSFY6Q,85.0,52.0,153.0,8900.85,72,7.100000,68.500000,...,0,0,0,1,1,1,1,1,1,1
30555,YWSFY6Q X 2020-03-18,2020-03-18,YWSFY6Q,103.0,33.0,149.0,13963.90,72,19.100000,66.300003,...,0,0,0,1,1,1,1,1,1,1


```
Here are the 3 different imputation methods that can be done. 
Please use these for your references.
X_mean_train 
Y_mean_train 
X_mode_train 
Y_mode_train 
X_median_train 
Y_median_train 
X_regression_train (not finished, please help me modify it if you are free)
Y_regression_train 
X_missing_train
Y_missing_train
```