In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from joblib import Parallel, delayed


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
class DataPreprocessor:
    """Prepare the raw train/test frames for model fitting.

    ``preprocess`` fills missing ``DistrictId`` values, drops
    ``NationalRegionCode``, optionally maps the ``Target`` labels to numbers,
    drops columns whose share of missing values in the training data exceeds
    ``missing_value_threshold``, and imputes the remaining categorical and
    numerical feature columns.

    Attributes populated by ``preprocess``:
        numerical_cols / categorical_cols: candidate column names that are
            still present in the training features.
        categorical_imputer: ``SimpleImputer(strategy='most_frequent')``
            fitted on the training features.
        numerical_imputer: ``KNNImputer(n_neighbors=5)`` fitted on the
            training features.
    """

    def __init__(self, target_mapping=None, missing_value_threshold=0.11):
        """Store the configuration; nothing is fitted until ``preprocess``.

        Args:
            target_mapping: optional dict passed to ``Series.map`` on the
                ``Target`` column; when falsy the target is left as-is.
            missing_value_threshold: columns whose fraction of missing values
                in the training frame is strictly greater than this are
                dropped from both frames.
        """
        self.target_mapping = target_mapping
        self.missing_value_threshold = missing_value_threshold
        self.categorical_imputer = None
        self.numerical_imputer = None
        self.numerical_cols = None
        self.categorical_cols = None

    def code_agriculture_zoning(self, ag_id):
        """Bucket an agriculture zoning code into group 1, 2 or 3.

        Per the original author's note, the boundaries were found from plots
        of each feature against the target.

        Returns:
            1 for codes below 20, strictly between 26 and 38, or strictly
            between 41 and 46; 2 for codes from 20 up to (but excluding) 27;
            3 for everything else. NaN fails every comparison and therefore
            also yields 3.
        """
        if (ag_id < 20 or (ag_id > 26 and ag_id < 38) or (ag_id > 41 and ag_id < 46)):
            return 1
        elif 19 < ag_id < 27:
            return 2
        else:
            return 3

    def _fill_district_id(self, data):
        """Return a copy of ``data`` with missing ``DistrictId`` values filled.

        Missing entries take ``NationalRegionCode`` first; whatever is still
        missing takes the zoning group from ``code_agriculture_zoning``.

        The column is assigned back explicitly instead of calling
        ``data['DistrictId'].fillna(..., inplace=True)``: that chained form
        acts on a temporary Series and stops updating the frame once pandas
        Copy-on-Write is active. Working on a copy also leaves the caller's
        frame untouched.
        """
        filled = data.copy()
        zoning_group = filled['AgricultureZoningCode'].apply(self.code_agriculture_zoning)
        filled['DistrictId'] = (
            filled['DistrictId']
            .fillna(filled['NationalRegionCode'])
            .fillna(zoning_group)
        )
        return filled

    def preprocess(self, train_data, test_data):
        """Clean and impute both frames, fitting the imputers on train only.

        Args:
            train_data: training frame; must contain ``Target``,
                ``DistrictId``, ``NationalRegionCode`` and
                ``AgricultureZoningCode``.
            test_data: test frame with the same feature columns (no
                ``Target`` is required).

        Returns:
            ``(X_train, y_train, X_test)``. The input frames are not modified.
        """
        # Fill 'DistrictId', then drop 'NationalRegionCode' (drop returns new frames)
        train_data = self._fill_district_id(train_data).drop(['NationalRegionCode'], axis=1)
        test_data = self._fill_district_id(test_data).drop(['NationalRegionCode'], axis=1)

        # Map 'Target' column to numerical values
        if self.target_mapping:
            train_data['Target'] = train_data['Target'].map(self.target_mapping)

        # Drop columns with missing values above the threshold; the decision is
        # made on the training frame only and applied to both frames
        columns_to_drop = self.get_columns_to_drop(train_data)
        train_data = train_data.drop(columns=columns_to_drop)
        test_data = test_data.drop(columns=columns_to_drop)

        # Separate features and target variable
        X_train = train_data.drop(['Target'], axis=1)
        y_train = train_data['Target']
        X_test = test_data

        # Define categorical and numerical columns
        self.update_column_lists(X_train)

        # Impute categorical features with the most frequent value
        self.categorical_imputer = SimpleImputer(strategy='most_frequent')
        X_train[self.categorical_cols] = self.categorical_imputer.fit_transform(X_train[self.categorical_cols])

        # Use KNNImputer for numerical features (a mean-strategy SimpleImputer
        # is the simpler alternative)
        self.numerical_imputer = KNNImputer(n_neighbors=5)
        X_train[self.numerical_cols] = self.numerical_imputer.fit_transform(X_train[self.numerical_cols])

        # Transform X_test using the imputers fitted on the training features
        X_test[self.categorical_cols] = self.categorical_imputer.transform(X_test[self.categorical_cols])
        X_test[self.numerical_cols] = self.numerical_imputer.transform(X_test[self.numerical_cols])

        return X_train, y_train, X_test

    def get_columns_to_drop(self, data):
        """Return the labels of columns whose missing fraction exceeds the threshold.

        The comparison is strict: a column exactly at
        ``missing_value_threshold`` is kept.
        """
        # Fraction of missing values per column
        missing_percentage = data.isnull().mean()
        return missing_percentage[missing_percentage > self.missing_value_threshold].index

    def update_column_lists(self, X_train):
        """Set ``numerical_cols`` / ``categorical_cols`` to the candidates present in ``X_train``.

        Columns of ``X_train`` that appear in neither candidate list end up in
        neither attribute and are therefore not imputed by ``preprocess``.
        """
        numerical_candidates = [
            'UndergroundStorageSqft', 'WaterAccessPoints', 'WaterAccessPointsCalc',
            'PrimaryCropAreaSqft', 'TotalCultivatedAreaSqft', 'CultivatedAreaSqft1',
            'PerimeterGuardPlantsArea', 'PrimaryCropareainsqft2', 'CultivatedAndWildArea',
            'TotalAreaSqft', 'NumberGreenHouses', 'MainIrrigationSystemCount',
            'FarmVehicleCount', 'FarmEquipmentArea', 'Latitude', 'Longitude', 'FieldSizeSqft',
            'WaterReservoirCount', 'TotalReservoirSize', 'StorageAndFacilityCount',
            'PartialIrrigationSystemCount', 'FarmingUnitCount', 'FarmShedAreaSqft',
            'HarvestStorageSqft', 'FieldEstablishedYear', 'NumberOfFarmingZones',
            'TaxAgrarianValue', 'TaxLandValue', 'TotalValue', 'TotalTaxAssessed',
            'ValuationYear', 'TaxOverdueYear'
        ]
        categorical_candidates = [
            'SoilFertilityType', 'ReservoirType', 'LandUsageType', 'CropSpeciesVariety',
            'FieldShadeCover', 'HasPestControl', 'ReservoirWithFilter', 'NaturalLakePresence',
            'HasGreenHouse', 'TaxOverdueStatus', 'AgricultureZoningCode', 'OtherZoningCode',
            'TypeOfIrrigationSystem', 'CropFieldConfiguration', 'FarmClassification',
            'HarvestProcessingType', 'FieldZoneLevel', 'FieldConstructionType'
        ]
        # Keep only candidates that survived the earlier column drops
        self.numerical_cols = [col for col in numerical_candidates if col in X_train.columns]
        self.categorical_cols = [col for col in categorical_candidates if col in X_train.columns]

# Model training class
class XGBoostModel:
    """Wrapper around an ``XGBClassifier`` that is fitted with balanced sample weights."""

    def __init__(self):
        """Create the underlying classifier (softmax objective, three classes)."""
        self.model = XGBClassifier(objective='multi:softmax', num_class=3)

    def compute_sample_weights(self, y_train):
        """Return one 'balanced' class weight per training sample.

        Args:
            y_train: class labels (a pandas Series or any 1-D sequence).

        Returns:
            A NumPy array with the same length as ``y_train``.
        """
        labels = pd.Series(y_train)
        classes = np.unique(labels)
        class_weights = compute_class_weight('balanced', classes=classes, y=labels)
        # Key the weights by the actual label, not by position: the weights come
        # back in the order of `classes`, which only coincides with 0..k-1 when
        # every label is present and the labels are contiguous from zero.
        weight_by_label = dict(zip(classes, class_weights))
        # Convert class weights to sample weights
        return labels.map(weight_by_label).values

    def train(self, X_train, y_train):
        """Fit the classifier on ``X_train`` / ``y_train`` using balanced sample weights."""
        sample_weights = self.compute_sample_weights(y_train)
        self.model.fit(X_train, y_train, sample_weight=sample_weights)

    def predict(self, X_test):
        """Return the classifier's predictions for ``X_test``."""
        return self.model.predict(X_test)

# Cross-validation function with multiprocessing
def cross_validate_model(X, y, model, n_splits=5, n_jobs=-2):
    """Estimate the macro F1 score of ``model`` with shuffled K-fold cross-validation.

    Args:
        X: feature frame, indexed positionally via ``.iloc``.
        y: target Series, indexed positionally via ``.iloc``.
        model: object exposing ``train(X, y)`` and ``predict(X)``.
        n_splits: number of folds.
        n_jobs: forwarded to ``joblib.Parallel``.

    Returns:
        The mean of the per-fold macro F1 scores.
    """

    def _score_split(fit_rows, holdout_rows):
        # Slice both parts of the fold before fitting
        X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
        y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

        model.train(X_fit, y_fit)
        predicted = model.predict(X_holdout)
        return f1_score(y_holdout, predicted, average='macro')

    # Fixed seed so the fold assignment is the same on every run
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One delayed task per fold, dispatched through joblib
    fold_tasks = [
        delayed(_score_split)(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X)
    ]
    fold_scores = Parallel(n_jobs=n_jobs)(fold_tasks)

    return np.mean(fold_scores)

In [6]:
import argparse
def make_predictions(test_fname, predictions_fname, train_fname='train.csv'):
    """Train on ``train_fname``, predict ``test_fname`` and write a UID/Target CSV.

    Args:
        test_fname: path of the test CSV; must contain a ``UID`` column.
        predictions_fname: path the predictions CSV is written to.
        train_fname: path of the training CSV (defaults to ``'train.csv'``).

    Side effects:
        Prints the 10-fold cross-validated macro F1 score and writes
        ``predictions_fname`` with columns ``UID`` and ``Target``.
    """
    # Single source of truth for the label encoding; the inverse is derived
    # from it so the two directions cannot drift apart.
    label_to_code = {'low': 0, 'medium': 1, 'high': 2}
    code_to_label = {code: label for label, code in label_to_code.items()}

    train_data = pd.read_csv(train_fname)
    test_data = pd.read_csv(test_fname)

    # Preprocess the data
    preprocessor = DataPreprocessor(target_mapping=label_to_code)
    X_train, y_train, X_test = preprocessor.preprocess(train_data, test_data)

    # NOTE(review): 'UID' is only read back from X_test below, so it is still
    # among the columns passed to the model — confirm that an identifier is
    # meant to be a model input.

    # Initialize and cross-validate the model
    xgb_model = XGBoostModel()
    average_f1_score = cross_validate_model(X_train, y_train, xgb_model, n_splits=10)
    print(f"Average Macro F1 Score: {average_f1_score}")

    # Train the model on the full training set and make predictions on the test set
    xgb_model.train(X_train, y_train)
    predicted_codes = xgb_model.predict(X_test)

    # Map the predicted codes back to their original labels. The Series is
    # built on X_test's index so that it lines up with the UID column row by
    # row even if that index is not a fresh 0..n-1 range.
    predicted_labels = pd.Series(predicted_codes, index=X_test.index).map(code_to_label)

    # Combine UID with the predicted labels and save them
    results = pd.DataFrame({'UID': X_test['UID'], 'Target': predicted_labels})
    results.to_csv(predictions_fname, index=False)

if __name__ == "__main__":
    # An argparse command line (--train-file, --test-file, --predictions-file)
    # was sketched here but is disabled; even in that sketch only the test and
    # predictions paths were forwarded. The paths are fixed instead.
    make_predictions(test_fname='test.csv', predictions_fname='predictions.csv')


Average Macro F1 Score: 0.4336663884406303
