Loading all the libraries

In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

Load the training data and store the median and mode values used for imputation.

In [30]:
# Load the training features once to derive the fallback statistics that the
# imputation step reads (per-column mode and median).
original_data = pd.read_csv('train.csv')
original_data = original_data.drop(['UID', 'Target'], axis=1)

# Columns imputed with their most frequent value.
categorical_cols = ['SoilFertilityType', 'TypeOfIrrigationSystem', 'CropFieldConfiguration',
                       'FarmClassification', 'HarvestProcessingType', 'LandUsageType','DistrictId','NationalRegionCode']
# mode() returns every tied value in sorted order; [0] picks the first of them.
mode_values = {col: original_data[col].mode()[0] for col in categorical_cols}

# Median of each numeric column. numeric_only=True restricts the reduction to
# numeric columns: with the pandas 2.x default, a single non-numeric column
# makes DataFrame.median() raise instead of being skipped.
median_values = original_data.median(numeric_only=True)

Here is the function that creates additional columns, which help us improve the efficiency of the model.

In [33]:
def create_additional_features(df, current_year=2024):
    """Return a copy of ``df`` with engineered ratio, age and location columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns referenced below (field size, cultivated
        area, water/greenhouse/vehicle counts, storage areas, value and tax
        columns, ``FieldEstablishedYear``, ``Latitude`` and ``Longitude``).
    current_year : int, default 2024
        Reference year used to compute ``field_age``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Notes
    -----
    Every ratio whose denominator is 0 is set to NaN rather than +/-inf, so
    downstream scaling/modelling code never receives infinite values.
    """
    data = df.copy()

    def _ratio(numerator, denominator):
        # Mask zero denominators to NaN first: plain division would yield
        # +/-inf (or NaN for 0/0), and inf is rejected by most estimators.
        return numerator / denominator.where(denominator != 0)

    field_size = data['FieldSizeSqft']
    cultivated_area = data['TotalCultivatedAreaSqft']

    # 1. Water-related ratios and interactions
    data['water_density'] = _ratio(data['WaterAccessPoints'], field_size)
    data['water_per_cultivated'] = _ratio(data['WaterAccessPoints'], cultivated_area)
    data['reservoir_density'] = _ratio(data['WaterReservoirCount'], field_size)

    # 2. Cultivation efficiency metrics
    data['cultivation_ratio'] = _ratio(cultivated_area, field_size)
    data['greenhouse_density'] = _ratio(data['NumberGreenHouses'], field_size)
    data['farming_intensity'] = _ratio(data['FarmingUnitCount'], cultivated_area)

    # 3. Infrastructure utilization
    total_storage = data['UndergroundStorageSqft'] + data['HarvestStorageSqft']
    data['storage_per_area'] = _ratio(total_storage, field_size)
    data['equipment_ratio'] = _ratio(data['FarmEquipmentArea'], field_size)

    # 4. Economic indicators
    data['value_per_sqft'] = _ratio(data['TotalValue'], field_size)
    data['tax_burden'] = _ratio(data['TotalTaxAssessed'], data['TotalValue'])

    # 5. Operational scale indicators
    data['vehicle_per_area'] = _ratio(data['FarmVehicleCount'], cultivated_area)
    data['irrigation_coverage'] = _ratio(data['MainIrrigationSystemCount'], cultivated_area)

    # 6. Field age and development
    data['field_age'] = current_year - data['FieldEstablishedYear']

    # 7. Location-based feature: a "<lat>_<lon>" string key (object dtype)
    data['location_cluster'] = data['Latitude'].astype(str) + '_' + data['Longitude'].astype(str)

    return data

Here is the function that preprocesses the data.

In [36]:
# Encoders, scaler and clipping bounds fitted by the most recent
# preprocessing(..., is_train=True) call. They are reused for is_train=False so
# that train and test rows are encoded, scaled and clipped identically.
_PREPROCESSING_STATE = {}


def preprocessing(df, is_train=True):
    """Impute, feature-engineer, label-encode, scale and clip a raw data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. ``UID`` and ``Target`` (when present) are passed through
        without imputation, encoding, scaling or clipping.
    is_train : bool, default True
        When True, label encoders, the standard scaler and the IQR clipping
        bounds are fitted on ``df`` and remembered. When False, the remembered
        transformers are applied unchanged. If nothing has been fitted yet, the
        function falls back to fitting on ``df`` and emits a warning.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``df``; the input frame is not modified.

    Notes
    -----
    Relies on the module-level ``mode_values`` and ``median_values`` statistics
    and on ``create_additional_features``.
    """
    import warnings

    data = df.copy()
    passthrough_cols = ('UID', 'Target')

    categorical_cols = ['SoilFertilityType', 'TypeOfIrrigationSystem', 'CropFieldConfiguration',
                       'FarmClassification', 'HarvestProcessingType', 'LandUsageType','DistrictId','NationalRegionCode']

    # 1. Handle categorical variables: fill gaps with the stored training mode.
    for col in categorical_cols:
        if data[col].isnull().sum() > 0:
            data[col] = data[col].fillna(mode_values[col])

    # 2. Handle numerical variables.
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        # Identifier/label columns have no stored median and must not be imputed.
        if col in passthrough_cols or data[col].isnull().sum() == 0:
            continue
        if col in ['WaterAccessPoints', 'WaterAccessPointsCalc']:
            # Nearest-neighbour interpolation for water-related features.
            data[col] = data[col].interpolate(method='nearest')
        elif 'Area' in col or 'Sqft' in col:
            # Linear interpolation for area-related features.
            data[col] = data[col].interpolate(method='linear')
        # Median for every other numeric feature, and as a fallback for gaps
        # interpolation could not fill (e.g. NaNs before the first valid value).
        # Columns without a stored median are left untouched instead of raising.
        if col in median_values:
            data[col] = data[col].fillna(median_values[col])

    # 3. Create advanced features.
    data = create_additional_features(data)

    fit_on_this_frame = is_train or not _PREPROCESSING_STATE
    if fit_on_this_frame and not is_train:
        warnings.warn(
            "preprocessing(is_train=False) called before any training fit; "
            "fitting encoders and scaler on the provided data instead.",
            RuntimeWarning,
        )

    if fit_on_this_frame:
        # 4. Encode categorical (object) variables.
        label_encoders = {}
        for col in data.select_dtypes(include=['object']).columns:
            if col not in passthrough_cols:
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col].astype(str))
                label_encoders[col] = le

        # 5. Scale numeric features (this includes the freshly encoded columns
        #    whenever the encoder output dtype is int64).
        scaled_cols = [col for col in data.select_dtypes(include=['float64', 'int64']).columns
                       if col not in passthrough_cols]
        scaler = StandardScaler()
        data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

        # 6. Handle outliers using the IQR method on the scaled values.
        # NOTE(review): a column whose IQR is 0 has lower == upper and is
        # therefore clipped to a single constant value.
        clip_bounds = {}
        for col in scaled_cols:
            q1 = data[col].quantile(0.25)
            q3 = data[col].quantile(0.75)
            iqr = q3 - q1
            clip_bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
            data[col] = data[col].clip(*clip_bounds[col])

        _PREPROCESSING_STATE.clear()
        _PREPROCESSING_STATE.update(
            label_encoders=label_encoders,
            scaler=scaler,
            scaled_cols=scaled_cols,
            clip_bounds=clip_bounds,
        )
    else:
        # 4. Encode with the training encoders; labels never seen during
        #    fitting are mapped to -1 instead of raising.
        for col, le in _PREPROCESSING_STATE['label_encoders'].items():
            if col not in data.columns:
                continue
            codes = {str(label): code for code, label in enumerate(le.classes_)}
            data[col] = data[col].astype(str).map(codes).fillna(-1).astype('int64')

        # 5. Scale with the training mean/variance.
        scaled_cols = _PREPROCESSING_STATE['scaled_cols']
        data[scaled_cols] = _PREPROCESSING_STATE['scaler'].transform(data[scaled_cols])

        # 6. Clip with the bounds computed on the training data.
        for col, (lower_bound, upper_bound) in _PREPROCESSING_STATE['clip_bounds'].items():
            data[col] = data[col].clip(lower_bound, upper_bound)

    return data



Here is the model that we have built.

The hyperparameters used were obtained through grid-search cross-validation.

In [40]:
def train_model(X, y):
    """Fit a class-weighted XGBoost classifier on all of ``(X, y)`` and return it.

    Each row is weighted by the 'balanced' weight of its class, as computed by
    ``compute_class_weight`` over the labels found in ``y``.
    """
    # Per-class 'balanced' weights, keyed by label.
    labels = np.unique(y)
    balanced = compute_class_weight('balanced', classes=labels, y=y)
    weight_by_label = dict(zip(labels, balanced))

    # Expand the per-class weights to one weight per training row.
    row_weights = np.array(list(map(weight_by_label.__getitem__, y)))

    # CPU-optimized XGBoost configuration (histogram tree method).
    xgb_params = dict(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=1,
        gamma=0.5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method='hist',
        max_bin=256,
        grow_policy='lossguide',
        eval_metric=['mlogloss', 'merror'],
        n_jobs=-1,
    )
    classifier = xgb.XGBClassifier(**xgb_params)

    # Train on the entire dataset; no validation split is held out here.
    classifier.fit(X, y, sample_weight=row_weights, verbose=True)
    return classifier

Here is the code that preprocesses the data, trains the model, and stores the predictions in submission.csv.

In [45]:
# Integer code assigned to each target label.
mapping = {'low': 0, 'medium': 1, 'high': 2}

# Load the raw training rows and convert the string labels to integer codes.
train_df = pd.read_csv("train.csv")
train_df['Target'] = train_df['Target'].map(mapping)

# Run the full preprocessing pipeline in training mode.
train_processed = preprocessing(train_df, is_train=True)

# Separate the label from the features; UID is dropped from the feature matrix.
y = train_processed['Target']
X = train_processed.drop(columns=['UID', 'Target'])

# Fit the final model on the entire training set.
model_with_total_data = train_model(X, y)



In [47]:
def make_predictions(test_fname, predictions_fname):
    """Predict the target for a test set and write a submission CSV.

    Parameters
    ----------
    test_fname : str, path-like or pandas.DataFrame
        Path to a test file in the same format as test.csv, or an
        already-loaded DataFrame with the same columns.
    predictions_fname : str or path-like
        Destination of the submission file, written with the columns
        ``UID`` and ``Target`` (same format as sample_submission.csv).

    Returns
    -------
    None
        The predictions are only saved to ``predictions_fname``.
    """
    # The contract describes a file name, but earlier usage passed a DataFrame
    # directly; support both so neither kind of caller breaks.
    if isinstance(test_fname, pd.DataFrame):
        test_df = test_fname
    else:
        test_df = pd.read_csv(test_fname)

    test_processed = preprocessing(test_df, is_train=False)
    test_X = test_processed.drop(['UID'], axis=1)

    # Uses the module-level model trained on the full training set.
    predictions = model_with_total_data.predict(test_X)

    # Convert integer class codes back to their string labels.
    reverse_mapping = {0: 'low', 1: 'medium', 2: 'high'}
    predictions = [reverse_mapping[label] for label in predictions]

    submission = pd.DataFrame({
        'UID': test_df['UID'],
        'Target': predictions
    })
    submission.to_csv(predictions_fname, index=False)
