In [165]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import StratifiedKFold, cross_validate

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, f1_score
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from scipy.stats import pearsonr, spearmanr
import pandas as pd



In [166]:
# Replace 'your_file.csv' with the path to your CSV file
train_path = 'train.csv'
test_path = 'test.csv'

# Read the CSV file into a DataFrame
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

print(f"{train_data.columns}, Length = {len(train_data.columns)}")
data = train_data

Index(['UID', 'AgriculturalPostalZone', 'AgricultureZoningCode',
       'CropFieldConfiguration', 'CropSpeciesVariety', 'CultivatedAndWildArea',
       'CultivatedAreaSqft1', 'DistrictId', 'FarmClassification',
       'FarmEquipmentArea', 'FarmShedAreaSqft', 'FarmVehicleCount',
       'FarmingCommunityId', 'FarmingUnitCount', 'FieldConstructionType',
       'FieldEstablishedYear', 'FieldShadeCover', 'FieldSizeSqft',
       'FieldZoneLevel', 'HarvestProcessingType', 'HarvestStorageSqft',
       'HasGreenHouse', 'HasPestControl', 'LandUsageType', 'Latitude',
       'Longitude', 'MainIrrigationSystemCount', 'NationalRegionCode',
       'NaturalLakePresence', 'NumberGreenHouses', 'NumberOfFarmingZones',
       'OtherZoningCode', 'PartialIrrigationSystemCount',
       'PerimeterGuardPlantsArea', 'PrimaryCropAreaSqft',
       'PrimaryCropAreaSqft2', 'RawLocationId', 'ReservoirType',
       'ReservoirWithFilter', 'SoilFertilityType', 'StorageAndFacilityCount',
       'TaxAgrarianValue', 'TaxL

In [167]:
class BestModel:
    def __init__(self, model=None, train_data_fname="'train.csv'", params_list=None, n_fold_cross_validation=10):
        self.model = model  # The model to be tuned
        self.originalTrain = pd.read_csv(train_data_fname)
        self.originalTest = None
        self.train_data = self.pre_process(self.originalTrain, "train")  # Training data
        # self.test_data = self.pre_process(test_data, "test")  # Testing data
        self.test_data = None
        self.params_list = params_list  # Hyperparameters to tune
        self.best_model = None  # To store the best model after tuning
        self.best_params = None  # To store the best parameters
        self.n_fold_cross_validation = n_fold_cross_validation  # Number of folds for cross-validation
        self.columns_to_keep = []  # Columns selected after pre-processing
        self.feature_importance = {}  # Store feature importance (MI)

    def map_categories_to_numeric(self, data, column_name="Target"):
        # Define the category mapping
        category_mapping = {'low': 0, 'medium': 1, 'high': 2}
        
        # Map the column to numeric values
        data[column_name] = data[column_name].map(category_mapping)
        
        return data

    def replace_missing_values(self, data):
        # Iterate over each column and replace missing values
        for column in data.columns:
            # Check if the column is of type float
            if data[column].dtype == 'float64' or data[column].dtype == 'float32':
                # Convert floats that represent integers to int
                if (data[column] % 1 == 0).all():  # Check if all values in the column are effectively integers
                    data[column] = data[column].astype('int64')  # Convert to integer type

                # Replace missing values in float columns with the mean
                mean_value = data[column].mean()
                data[column] = data[column].fillna(mean_value)  # Fixing the warning by avoiding inplace=True

            # Check if the column is of type int (or was converted from float to int)
            elif data[column].dtype == 'int64' or data[column].dtype == 'int32':
                # Replace missing values in integer columns with the mode
                mode_value = data[column].mode()[0]
                data[column] = data[column].fillna(mode_value)  # Fixing the warning by avoiding inplace=True

        return data

    def pre_process(self, data, dataType="train"):
        if dataType == "train":
            data = self.map_categories_to_numeric(data)
            
            # Drop columns with more than a certain threshold of missing values
            threshold = 0.85  # 80% missing values
            threshold_count = int((1 - threshold) * len(data))
            data = data.dropna(axis=1, thresh=threshold_count)

            # Replace missing values in columns
            data = self.replace_missing_values(data)

            # Remove irrelevant columns based on feature-target relationship
            data = self.remove_irrelevant_columns(data)


        elif dataType == "test":
            # Ensure all columns in self.columns_to_keep are in the test set
            missing_columns = [col for col in self.columns_to_keep if col not in data.columns]
            if missing_columns:
                # Add missing columns to the test data with NaN values
                for col in missing_columns:
                    data[col] = pd.NA  # You can replace `pd.NA` with other placeholder values if needed
                # print(f"Added missing columns to the test data: {missing_columns}")

            # Drop any extra columns that are not in self.columns_to_keep
            extra_columns = [col for col in data.columns if col not in self.columns_to_keep]
            if extra_columns:
                data = data.drop(columns=extra_columns)
                # print(f"Dropped extra columns from the test data: {extra_columns}")


            # Reindex columns in the same order as the training data
            data = data.reindex(columns=self.columns_to_keep)

            # Replace missing values in the test data
            data = self.replace_missing_values(data)
            data = data.drop(columns='Target')
            
            # print(f"Final test data columns: {data.columns}")

        return data


    # def remove_irrelevant_columns(self, data):
    #     # Separate features and target
    #     X = data.drop('Target', axis=1)
    #     y = data['Target']
        
    #     # Train a Random Forest model to determine feature importances
    #     model = RandomForestClassifier(random_state=42)
    #     model.fit(X, y)

    #     # Get feature importances
    #     importances = model.feature_importances_

    #     # Create a DataFrame to display feature importances
    #     feature_importance_df = pd.DataFrame({
    #         'Feature': X.columns,
    #         'Importance': importances
    #     }).sort_values(by='Importance', ascending=False)

    #     print("Feature Importances:\n", feature_importance_df)

    #     # Determine columns to remove based on importance threshold
    #     threshold = 0.01  # Adjust the threshold as needed
    #     irrelevant_columns = feature_importance_df[feature_importance_df['Importance'] < threshold]['Feature'].tolist()
        
    #     print(f"Removing columns with low importance: {irrelevant_columns}")
        
    #     # Drop irrelevant columns
    #     data = data.drop(columns=irrelevant_columns)

    #     return data
    
    # def remove_irrelevant_columns(self, data):
    #     # Assuming 'data' is your DataFrame and 'Target' is the name of your target variable
    #     X = data.drop('Target', axis=1)
    #     y = data['Target']

    #     # Initialize an empty list to collect feature correlation dictionaries
    #     correlation_list = []

    #     for column in X.columns:
    #         # Calculate Pearson correlation (only if y is continuous)
    #         pearson_corr, _ = pearsonr(X[column], y)
    #         # Calculate Spearman correlation
    #         spearman_corr, _ = spearmanr(X[column], y)
            
    #         # Append to the list as a dictionary
    #         correlation_list.append({
    #             'Feature': column,
    #             'Pearson': pearson_corr,
    #             'Spearman': spearman_corr
    #         })

    #     # Create a DataFrame from the list
    #     correlation_df = pd.DataFrame(correlation_list)

    #     # Sort features by the absolute value of their Spearman correlations
    #     correlation_df['Abs_Pearson'] = correlation_df['Pearson'].abs()
    #     correlation_df['Abs_Spearman'] = correlation_df['Spearman'].abs()
    #     correlation_df = correlation_df.sort_values(by='Abs_Pearson ', ascending=False)

    #     # Select the top 30 features
    #     top_features = correlation_df['Feature'].head(30).tolist()

    #     # Keep only the top 30 features in the original data
    #     data = data[top_features + ['Target']]


    #     print("Top 30 features based on absolute Spearman correlation:\n", top_features)

    #     return data



    def remove_irrelevant_columns(self, data):
        correlation_threshold=0.01
        if('Target' in data.columns):
            correlation_matrix = data.corr()
            low_corr_columns = correlation_matrix['Target'].apply(lambda x: abs(x) < correlation_threshold)
            columns_to_drop = correlation_matrix.columns[low_corr_columns].tolist()

            data = data.drop(columns=columns_to_drop)
        return data



    def best_model_outputer(self):
        # Define the custom scoring function (F1 score)
        f1_scorer = make_scorer(f1_score, average='macro')

        # Perform Grid Search with Stratified K-Fold and F1 score as the metric
        grid_search = GridSearchCV(
            self.model, 
            self.params_list, 
            cv=self.n_fold_cross_validation, 
            scoring=f1_scorer, 
            n_jobs=-2, 
            verbose=1
        )

        # Use the entire training data for hyperparameter tuning
        grid_search.fit(self.train_data.drop('Target', axis=1), self.train_data['Target'])
        
        # Store the best model and best parameters
        self.best_model = grid_search.best_estimator_
        self.best_params = grid_search.best_params_

        print(f"Best parameters: {self.best_params}")
        print(f"Best F1 score from Grid Search: {grid_search.best_score_}")

        # Perform cross-validation with the best model
        kf = StratifiedKFold(n_splits=self.n_fold_cross_validation, shuffle=True, random_state=42)
        f1_scores_macro = []

        # Cross-validation loop to calculate the macro F1 score for each fold
        for train_index, test_index in kf.split(self.train_data.drop('Target', axis=1), self.train_data['Target']):
            X_train, X_test = self.train_data.drop('Target', axis=1).iloc[train_index], self.train_data.drop('Target', axis=1).iloc[test_index]
            y_train, y_test = self.train_data['Target'].iloc[train_index], self.train_data['Target'].iloc[test_index]
            
            # Train the model on the training set
            self.best_model.fit(X_train, y_train)
            
            # Predict on the test set
            y_pred = self.best_model.predict(X_test)
            
            # Calculate the macro F1 score for this fold
            f1_macro = f1_score(y_test, y_pred, average='macro')
            f1_scores_macro.append(f1_macro)
        
        # Calculate the mean and standard deviation of the macro F1 scores
        mean_f1_macro = np.mean(f1_scores_macro)
        std_f1_macro = np.std(f1_scores_macro)

        print(f"\n\nCross-validation results:")
        print(f"\tMacro F1 Scores: {f1_scores_macro}")
        print(f"\tMean Macro F1 Score: {mean_f1_macro:.4f}")
        print(f"\tStandard Deviation of Macro F1 Score: {std_f1_macro:.4f}")

        return self.best_model, self.best_params

    def predict(self, test_data):
        # Generate predictions using the best model
        predictions = self.best_model.predict(test_data)
        
        # Define the reverse mapping from numeric to category
        reverse_mapping = {0: 'low', 1: 'medium', 2: 'high'}
        
        # Map numeric predictions back to categories
        return pd.Series(predictions).map(reverse_mapping)


    def make_predictions(self, test_fname, predictions_fname):
        # Save the columns to keep for consistency with the test set
        self.columns_to_keep = list(self.train_data.columns)  # Save as a list to maintain order
        print("Columns to keep:", self.columns_to_keep)

        test_data = pd.read_csv(test_fname)
        self.originalTest = test_data
        self.test_data = self.pre_process(data=test_data, dataType="test")
        predictions = self.predict(self.test_data)

        # Step 4: Add UID column from `copy_test` DataFrame to the `reversed_predictions`
        reversed_predictions_df = pd.DataFrame({
            'UID': self.originalTest['UID'],
            'Target': predictions  # The reversed prediction values
        })
        reversed_predictions_df.to_csv(predictions_fname, index=False)
        print("File Made and prediction complete")
        return reversed_predictions_df


In [168]:
def xgboost():
    # Usage Example
    # Assuming train_data and test_data are pandas DataFrames, and model is an estimator like XGBoost
    from xgboost import XGBClassifier

    # Example params list (you can adjust this for your model)
    params_list = {
        'learning_rate': [0.01],
        'n_estimators': [100, 150, 200],
        'max_depth': [7, 9]
    }


    # Assuming `train_data` and `test_data` are already defined
    model = XGBClassifier(random_state=42)

    # Create an instance of bestModel
    best_model_instance = BestModel(model, "train.csv", params_list)

    # Get the best model and its parameters
    best_model, best_params = best_model_instance.best_model_outputer()

    print(f"Best Model of xGBoost with params : {best_params}")
    best_model_instance.make_predictions('test.csv','submission.csv')


In [169]:
def svm():
    from sklearn.svm import SVC

    # Example hyperparameter list for SVM with RBF kernel
    params_list = {
        'C': [0.1, 1, 10],  # Regularization parameter
        'gamma': ['scale', 'auto', 0.1, 1],  # Kernel coefficient
        'kernel': ['rbf']  # RBF kernel
    }

    # Create an SVM model instance
    model = SVC(random_state=42)

    # Create an instance of BestModel
    best_model_instance = BestModel(model, "train.csv", params_list)

    # Get the best model and its parameters
    best_model, best_params = best_model_instance.best_model_outputer()

    print(f"Best Model of SVM with rbf kernal with params : {best_params}")
    best_model_instance.make_predictions('test.csv','submission.csv')


In [170]:
def random_forest():
    from sklearn.ensemble import RandomForestClassifier

    # Example hyperparameter list for Random Forest
    params_list = {
        'n_estimators': [100, 150, 200],  # Number of trees in the forest
        'max_depth': [10, 20, 15],  # Maximum depth of the tree
        'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split a node
    }

    # Create a Random Forest model instance
    model = RandomForestClassifier(random_state=42)

    # Create an instance of BestModel
    best_model_instance = BestModel(model, "train.csv", params_list)

    # Get the best model and its parameters
    best_model, best_params = best_model_instance.best_model_outputer()

    print(f"Best Model of Random Forest with params: {best_params}")
    best_model_instance.make_predictions('test.csv', 'submission.csv')


In [171]:
def knn():
    from sklearn.neighbors import KNeighborsClassifier

    # Example hyperparameter list for KNN
    params_list = {
        'n_neighbors': [3],  # Number of neighbors
        'weights': ['distance'],  # Weight function used in prediction
        'metric': ['manhattan']  # Distance metrics
    }

    # Create a KNN model instance
    model = KNeighborsClassifier()

    # Create an instance of BestModel
    best_model_instance = BestModel(model, "train.csv", params_list)

    # Get the best model and its parameters
    best_model, best_params = best_model_instance.best_model_outputer()

    print(f"Best Model of KNN with params: {best_params}")
    best_model_instance.make_predictions('test.csv', 'submission.csv')


In [172]:
def bi_lstm():
    import pandas as pd
    import numpy as np
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Bidirectional
    from sklearn.model_selection import train_test_split

    # Load your training data
    train_data = pd.read_csv("train.csv")
    X = train_data.iloc[:, :-1].values  # Assuming features are all columns except the last
    y = train_data.iloc[:, -1].values   # Assuming the last column is the target

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Reshape the data for LSTM (samples, timesteps, features)
    X_train = np.expand_dims(X_train, axis=2)
    X_val = np.expand_dims(X_val, axis=2)

    # Build a simple Bi-LSTM model
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(X_train.shape[1], 1)))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

    # Save predictions for the test data
    test_data = pd.read_csv("test.csv")
    X_test = test_data.values
    X_test = np.expand_dims(X_test, axis=2)
    predictions = model.predict(X_test)

    # Save predictions to submission.csv
    pd.DataFrame(predictions, columns=["Predictions"]).to_csv("submission.csv", index=False)

    print("Bi-LSTM model training and predictions completed.")


In [173]:
def linear_regression():
    from sklearn.linear_model import LinearRegression

    # Example hyperparameter list for Linear Regression
    params_list = {
        'fit_intercept': [True, False],  # Whether to calculate the intercept for this model
        'normalize': [True, False],  # Whether to normalize the data before fitting
    }

    # Create a Linear Regression model instance
    model = LinearRegression()

    # Create an instance of BestModel
    best_model_instance = BestModel(model, "train.csv", params_list)

    # Get the best model and its parameters
    best_model, best_params = best_model_instance.best_model_outputer()

    print(f"Best Model of Linear Regression with params: {best_params}")
    best_model_instance.make_predictions('test.csv', 'submission.csv')


In [174]:
def logistic_regression():
    from sklearn.linear_model import LogisticRegression

    # Example hyperparameter list for Logistic Regression
    params_list = {
        'C': [0.1, 1, 10],  # Regularization strength
        'penalty': ['l2'],  # Regularization type
        'solver': ['lbfgs', 'liblinear'],  # Optimization algorithm
        'max_iter': [100, 200]  # Maximum number of iterations for the solver
    }

    # Create a Logistic Regression model instance
    model = LogisticRegression()

    # Create an instance of BestModel
    best_model_instance = BestModel(model, "train.csv", params_list)

    # Get the best model and its parameters
    best_model, best_params = best_model_instance.best_model_outputer()

    print(f"Best Model of Logistic Regression with params: {best_params}")
    best_model_instance.make_predictions('test.csv', 'submission.csv')


In [None]:
# xgboost()
# svm()
# knn()
# random_forest()
# bi_lstm
logistic_regression()
# linear_regression()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Fitting 10 folds for each of 12 candidates, totalling 120 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters: {'C': 10, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs'}
Best F1 score from Grid Search: 0.2546294366168855


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
print("Done")

Done
