In [171]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import StratifiedKFold, cross_validate

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, f1_score
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from scipy.stats import pearsonr, spearmanr
import pandas as pd



In [172]:
# Replace 'your_file.csv' with the path to your CSV file
train_path = 'train.csv'
test_path = 'test.csv'

# Read the CSV file into a DataFrame
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

print(f"{train_data.columns}, Length = {len(train_data.columns)}")
data = train_data

Index(['UID', 'AgriculturalPostalZone', 'AgricultureZoningCode',
       'CropFieldConfiguration', 'CropSpeciesVariety', 'CultivatedAndWildArea',
       'CultivatedAreaSqft1', 'DistrictId', 'FarmClassification',
       'FarmEquipmentArea', 'FarmShedAreaSqft', 'FarmVehicleCount',
       'FarmingCommunityId', 'FarmingUnitCount', 'FieldConstructionType',
       'FieldEstablishedYear', 'FieldShadeCover', 'FieldSizeSqft',
       'FieldZoneLevel', 'HarvestProcessingType', 'HarvestStorageSqft',
       'HasGreenHouse', 'HasPestControl', 'LandUsageType', 'Latitude',
       'Longitude', 'MainIrrigationSystemCount', 'NationalRegionCode',
       'NaturalLakePresence', 'NumberGreenHouses', 'NumberOfFarmingZones',
       'OtherZoningCode', 'PartialIrrigationSystemCount',
       'PerimeterGuardPlantsArea', 'PrimaryCropAreaSqft',
       'PrimaryCropAreaSqft2', 'RawLocationId', 'ReservoirType',
       'ReservoirWithFilter', 'SoilFertilityType', 'StorageAndFacilityCount',
       'TaxAgrarianValue', 'TaxL

In [None]:
class BestModel:
    def __init__(self, model=None, train_data_fname="'train.csv'", params_list=None, n_fold_cross_validation=10):
        self.model = model  # The model to be tuned
        self.originalTrain = pd.read_csv(train_data_fname)
        self.originalTest = None
        self.train_data = self.pre_process(self.originalTrain, "train")  # Training data
        # self.test_data = self.pre_process(test_data, "test")  # Testing data
        self.test_data = None
        self.params_list = params_list  # Hyperparameters to tune
        self.best_model = None  # To store the best model after tuning
        self.best_params = None  # To store the best parameters
        self.n_fold_cross_validation = n_fold_cross_validation  # Number of folds for cross-validation
        self.columns_to_keep = []  # Columns selected after pre-processing
        self.feature_importance = {}  # Store feature importance (MI)

    def map_categories_to_numeric(self, data, column_name="Target"):
        # Define the category mapping
        category_mapping = {'low': 0, 'medium': 1, 'high': 2}
        
        # Map the column to numeric values
        data[column_name] = data[column_name].map(category_mapping)
        
        return data

    def replace_missing_values(self, data):
        # Iterate over each column and replace missing values
        for column in data.columns:
            # Check if the column is of type float
            if data[column].dtype == 'float64' or data[column].dtype == 'float32':
                # Convert floats that represent integers to int
                if (data[column] % 1 == 0).all():  # Check if all values in the column are effectively integers
                    data[column] = data[column].astype('int64')  # Convert to integer type

                # Replace missing values in float columns with the mean
                mean_value = data[column].mean()
                data[column] = data[column].fillna(mean_value)  # Fixing the warning by avoiding inplace=True

            # Check if the column is of type int (or was converted from float to int)
            elif data[column].dtype == 'int64' or data[column].dtype == 'int32':
                # Replace missing values in integer columns with the mode
                mode_value = data[column].mode()[0]
                data[column] = data[column].fillna(mode_value)  # Fixing the warning by avoiding inplace=True

        return data

    def pre_process(self, data, dataType="train"):
        if dataType == "train":
            data = self.map_categories_to_numeric(data)
            
            # Drop columns with more than a certain threshold of missing values
            threshold = 0.8  # 80% missing values
            threshold_count = int((1 - threshold) * len(data))
            data = data.dropna(axis=1, thresh=threshold_count)

            # Replace missing values in columns
            data = self.replace_missing_values(data)

            # Remove irrelevant columns based on feature-target relationship
            data = self.remove_irrelevant_columns(data)


        elif dataType == "test":
            # Ensure all columns in self.columns_to_keep are in the test set
            missing_columns = [col for col in self.columns_to_keep if col not in data.columns]
            if missing_columns:
                # Add missing columns to the test data with NaN values
                for col in missing_columns:
                    data[col] = pd.NA  # You can replace `pd.NA` with other placeholder values if needed
                # print(f"Added missing columns to the test data: {missing_columns}")

            # Drop any extra columns that are not in self.columns_to_keep
            extra_columns = [col for col in data.columns if col not in self.columns_to_keep]
            if extra_columns:
                data = data.drop(columns=extra_columns)
                # print(f"Dropped extra columns from the test data: {extra_columns}")


            # Reindex columns in the same order as the training data
            data = data.reindex(columns=self.columns_to_keep)

            # Replace missing values in the test data
            data = self.replace_missing_values(data)
            data = data.drop(columns='Target')
            
            # print(f"Final test data columns: {data.columns}")

        return data


    def remove_irrelevant_columns(self, data):
        # Calculate Mutual Information for classification (feature importance)
        mutual_info = mutual_info_classif(data.drop('Target', axis=1), data['Target'])
        feature_importance = dict(zip(data.drop('Target', axis=1).columns, mutual_info))
        
        # Threshold for mutual information to decide which columns are irrelevant
        irrelevant_columns = [column for column, importance in feature_importance.items() if importance < 0.015]  # You can adjust this threshold
        
        print(f"Removing columns with low mutual information: {irrelevant_columns}")
        data = data.drop(columns=irrelevant_columns)

        # Pearson and Spearman correlations to remove correlated features
        correlation_matrix = data.corr()
        for column in data.columns:
            try:
                # Skip columns that are constant (no variance)
                if data[column].nunique() == 1:
                    # print(f"Skipping {column} due to constant values.")
                    continue
                
                # Pearson Correlation with Target
                pearson_corr, _ = pearsonr(data[column], data['Target'])
                if abs(pearson_corr) < 0.015:  # A low correlation threshold
                    # print(f"Removing {column} due to low Pearson correlation with Target: {pearson_corr}")
                    data = data.drop(columns=[column])

                # Spearman Correlation with Target
                spearman_corr, _ = spearmanr(data[column], data['Target'])
                if abs(spearman_corr) < 0.015:  # A low correlation threshold
                    # print(f"Removing {column} due to low Spearman correlation with Target: {spearman_corr}")
                    data = data.drop(columns=[column])

            except Exception as e:
                print(f"Error calculating correlation for {column}: {e}")

        return data


    def best_model_outputer(self):
        # Define the custom scoring function (F1 score)
        f1_scorer = make_scorer(f1_score, average='macro')

        # Perform Grid Search with Stratified K-Fold and F1 score as the metric
        grid_search = GridSearchCV(
            self.model, 
            self.params_list, 
            cv=self.n_fold_cross_validation, 
            scoring=f1_scorer, 
            n_jobs=-2, 
            verbose=1
        )

        # Use the entire training data for hyperparameter tuning
        grid_search.fit(self.train_data.drop('Target', axis=1), self.train_data['Target'])
        
        # Store the best model and best parameters
        self.best_model = grid_search.best_estimator_
        self.best_params = grid_search.best_params_

        print(f"Best parameters: {self.best_params}")
        print(f"Best F1 score from Grid Search: {grid_search.best_score_}")

        # Perform cross-validation with the best model
        kf = StratifiedKFold(n_splits=self.n_fold_cross_validation, shuffle=True, random_state=42)
        f1_scores_macro = []

        # Cross-validation loop to calculate the macro F1 score for each fold
        for train_index, test_index in kf.split(self.train_data.drop('Target', axis=1), self.train_data['Target']):
            X_train, X_test = self.train_data.drop('Target', axis=1).iloc[train_index], self.train_data.drop('Target', axis=1).iloc[test_index]
            y_train, y_test = self.train_data['Target'].iloc[train_index], self.train_data['Target'].iloc[test_index]
            
            # Train the model on the training set
            self.best_model.fit(X_train, y_train)
            
            # Predict on the test set
            y_pred = self.best_model.predict(X_test)
            
            # Calculate the macro F1 score for this fold
            f1_macro = f1_score(y_test, y_pred, average='macro')
            f1_scores_macro.append(f1_macro)
        
        # Calculate the mean and standard deviation of the macro F1 scores
        mean_f1_macro = np.mean(f1_scores_macro)
        std_f1_macro = np.std(f1_scores_macro)

        print(f"\n\nCross-validation results:")
        print(f"\tMacro F1 Scores: {f1_scores_macro}")
        print(f"\tMean Macro F1 Score: {mean_f1_macro:.4f}")
        print(f"\tStandard Deviation of Macro F1 Score: {std_f1_macro:.4f}")

        return self.best_model, self.best_params

    def predict(self, test_data):
        # Generate predictions using the best model
        predictions = self.best_model.predict(test_data)
        
        # Define the reverse mapping from numeric to category
        reverse_mapping = {0: 'low', 1: 'medium', 2: 'high'}
        
        # Map numeric predictions back to categories
        return pd.Series(predictions).map(reverse_mapping)


    def make_predictions(self, test_fname, predictions_fname):
        # Save the columns to keep for consistency with the test set
        self.columns_to_keep = list(self.train_data.columns)  # Save as a list to maintain order
        print("Columns to keep:", self.columns_to_keep)

        test_data = pd.read_csv(test_fname)
        self.originalTest = test_data
        self.test_data = self.pre_process(data=test_data, dataType="test")
        predictions = self.predict(self.test_data)

        # Step 4: Add UID column from `copy_test` DataFrame to the `reversed_predictions`
        reversed_predictions_df = pd.DataFrame({
            'UID': self.originalTest['UID'],
            'Target': predictions  # The reversed prediction values
        })
        reversed_predictions_df.to_csv(predictions_fname, index=False)
        return reversed_predictions_df


In [174]:
# Usage Example
# Assuming train_data and test_data are pandas DataFrames, and model is an estimator like XGBoost
from xgboost import XGBClassifier

# Example params list (you can adjust this for your model)
params_list = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7]
}

params_list = {
    'learning_rate': [0.5],
    'n_estimators': [2],
    'max_depth': [3]
}


# Assuming `train_data` and `test_data` are already defined
model = XGBClassifier(random_state=42)

# Create an instance of bestModel
best_model_instance = BestModel(model, "train.csv", params_list)

# Get the best model and its parameters
best_model, best_params = best_model_instance.best_model_outputer()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Removing columns with low mutual information: ['UID', 'AgriculturalPostalZone', 'AgricultureZoningCode', 'CropSpeciesVariety', 'DistrictId', 'FarmVehicleCount', 'FieldEstablishedYear', 'FieldSizeSqft', 'HarvestProcessingType', 'LandUsageType', 'MainIrrigationSystemCount', 'NationalRegionCode', 'NumberOfFarmingZones', 'OtherZoningCode', 'SoilFertilityType', 'StorageAndFacilityCount', 'TaxAgrarianValue', 'TaxLandValue', 'TotalCultivatedAreaSqft', 'TotalTaxAssessed', 'TotalValue', 'TypeOfIrrigationSystem', 'ValuationYear', 'WaterAccessPoints', 'WaterAccessPointsCalc', 'WaterReservoirCount']
Removing FarmingCommunityId due to low Pearson correlation with Target: 0.0031323011796754218
Error calculating correlation for FarmingCommunityId: 'FarmingCommunityId'
Removing FarmingUnitCount due to low Pearson correlation with Target: -0.0015904990665650289
Error calculating correlation for FarmingUnitCount: 'FarmingUnitCount'
Removing Longitude due to low Pearson correlation with Target: 0.0112298

In [175]:
print(f"Best Model of xGBoost with params : {best_params}")
best_model_instance.make_predictions('test.csv','submission.csv')


Best Model of xGBoost with params : {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 2}
Columns to keep: ['CultivatedAreaSqft1', 'FarmEquipmentArea', 'Latitude', 'RawLocationId', 'TownId', 'Target']
Added missing columns to the test data: ['Target']
Dropped extra columns from the test data: ['UID', 'AgriculturalPostalZone', 'AgricultureZoningCode', 'CropFieldConfiguration', 'CropSpeciesVariety', 'CultivatedAndWildArea', 'DistrictId', 'FarmClassification', 'FarmShedAreaSqft', 'FarmVehicleCount', 'FarmingCommunityId', 'FarmingUnitCount', 'FieldConstructionType', 'FieldEstablishedYear', 'FieldShadeCover', 'FieldSizeSqft', 'FieldZoneLevel', 'HarvestProcessingType', 'HarvestStorageSqft', 'HasGreenHouse', 'HasPestControl', 'LandUsageType', 'Longitude', 'MainIrrigationSystemCount', 'NationalRegionCode', 'NaturalLakePresence', 'NumberGreenHouses', 'NumberOfFarmingZones', 'OtherZoningCode', 'PartialIrrigationSystemCount', 'PerimeterGuardPlantsArea', 'PrimaryCropAreaSqft', 'PrimaryCropArea

Unnamed: 0,UID,Target
0,130000,medium
1,129101,medium
2,147876,medium
3,122624,medium
4,159920,medium
...,...,...
15916,99588,medium
15917,86801,medium
15918,68439,medium
15919,13210,medium
