In [33]:
import pandas as pd

# Load the train and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Display the first few rows of each dataset to confirm loading
print("Train Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())

Train Data:
      UID  AgriculturalPostalZone  AgricultureZoningCode  \
0   12998                  291674                    0.0   
1   20860                  164397                   28.0   
2   75725                  616532                    0.0   
3  106521                  942111                   43.0   
4   99467                  475557                   38.0   

   CropFieldConfiguration  CropSpeciesVariety  CultivatedAndWildArea  \
0                     NaN                 3.0                    NaN   
1                     NaN                 4.0                    NaN   
2                     NaN                 2.0                    NaN   
3                     NaN                 7.0                    NaN   
4                     NaN                 3.0                    NaN   

   CultivatedAreaSqft1  DistrictId  FarmClassification  FarmEquipmentArea  \
0               1136.0         1.0                 NaN                NaN   
1               2083.0         1.0      

In [34]:
train_data['DistrictId'].fillna(train_data['NationalRegionCode'],inplace = True)
test_data['DistrictId'].fillna(test_data['NationalRegionCode'],inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['DistrictId'].fillna(train_data['NationalRegionCode'],inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['DistrictId'].fillna(test_data['NationalRegionCode'],inplace = True)


In [35]:
def code(ag_id):
  if (ag_id < 20 or (ag_id > 26 and ag_id < 38) or (ag_id>41 and ag_id<46)):
    return 1
  elif ag_id > 19 and ag_id < 27:
    return 2
  else:
    return 3

train_data['DistrictId'] = train_data['DistrictId'].fillna(train_data['AgricultureZoningCode'].apply(code))
test_data['DistrictId'] = test_data['DistrictId'].fillna(test_data['AgricultureZoningCode'].apply(code))

In [36]:
train_data = train_data.drop(['NationalRegionCode'], axis = 1)
test_data = test_data.drop(['NationalRegionCode'], axis = 1)

In [37]:
# Map 'Target' column to numerical values
target_mapping = {'low': 0, 'medium': 1, 'high': 2}
train_data['Target'] = train_data['Target'].map(target_mapping)

# Check the first few rows of train_data to confirm changes
print("Updated Target column:")
print(train_data[['Target']].head())

# Check data types to ensure correct formatting
print("\nData types in train data:")
print(train_data.dtypes)


Updated Target column:
   Target
0       2
1       1
2       1
3       0
4       1

Data types in train data:
UID                               int64
AgriculturalPostalZone            int64
AgricultureZoningCode           float64
CropFieldConfiguration          float64
CropSpeciesVariety              float64
CultivatedAndWildArea           float64
CultivatedAreaSqft1             float64
DistrictId                      float64
FarmClassification              float64
FarmEquipmentArea               float64
FarmShedAreaSqft                float64
FarmVehicleCount                float64
FarmingCommunityId              float64
FarmingUnitCount                float64
FieldConstructionType           float64
FieldEstablishedYear            float64
FieldShadeCover                 float64
FieldSizeSqft                   float64
FieldZoneLevel                  float64
HarvestProcessingType           float64
HarvestStorageSqft              float64
HasGreenHouse                   float64
HasPestCon

In [38]:
def drop_columns_with_missing_values(data, threshold=0.11):
    # Calculate missing values percentage for each column
    missing_percentage = data.isnull().mean()
    # Identify columns to drop (those with missing values above the threshold)
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    # Drop the identified columns
    return columns_to_drop

# Get the columns to drop from train_data
columns_to_drop_train = drop_columns_with_missing_values(train_data)

# Drop the same columns from train_data
train_data = train_data.drop(columns=columns_to_drop_train)

# Drop the same columns from test_data
test_data = test_data.drop(columns=columns_to_drop_train)

In [39]:
X_train = train_data.drop(['Target'], axis=1)
y_train = train_data['Target']
X_test = test_data

In [40]:

from sklearn.impute import SimpleImputer

# Define categorical and numerical columns
categorical_cols = ['SoilFertilityType', 'ReservoirType', 'LandUsageType', 'CropSpeciesVariety',
                    'FieldShadeCover', 'HasPestControl', 'ReservoirWithFilter', 'NaturalLakePresence',
                    'HasGreenHouse', 'TaxOverdueStatus', 'AgricultureZoningCode', 'OtherZoningCode',
                    'TypeOfIrrigationSystem', 'CropFieldConfiguration', 'FarmClassification',
                    'HarvestProcessingType', 'FieldZoneLevel', 'FieldConstructionType']

numerical_cols = ['UndergroundStorageSqft', 'WaterAccessPoints', 'WaterAccessPointsCalc',
                  'PrimaryCropAreaSqft', 'TotalCultivatedAreaSqft', 'CultivatedAreaSqft1',
                  'PerimeterGuardPlantsArea', 'PrimaryCropareainsqft2', 'CultivatedAndWildArea',
                  'TotalAreaSqft', 'NationalRegionCode', 'NumberGreenHouses', 'MainIrrigationSystemCount',
                  'FarmVehicleCount', 'FarmEquipmentArea', 'Latitude', 'Longitude', 'FieldSizeSqft',
                  'WaterReservoirCount', 'TotalReservoirSize', 'StorageAndFacilityCount',
                  'PartialIrrigationSystemCount', 'FarmingUnitCount', 'FarmShedAreaSqft',
                  'HarvestStorageSqft', 'FieldEstablishedYear', 'NumberOfFarmingZones', 'TaxAgrarianValue',
                  'TaxLandValue', 'TotalValue', 'TotalTaxAssessed', 'ValuationYear', 'TaxOverdueYear']

# Update numerical and categorical column lists after dropping columns with missing values
numerical_cols = [col for col in numerical_cols if col in X_train.columns]
categorical_cols = [col for col in categorical_cols if col in X_train.columns]

# Impute categorical features with the most frequent value (mode) from the training data
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])

# Impute numerical features with the mean from the training data
numerical_imputer = SimpleImputer(strategy='mean')
X_train[numerical_cols] = numerical_imputer.fit_transform(X_train[numerical_cols])

# Now use the same imputers to transform X_test
X_test[categorical_cols] = categorical_imputer.transform(X_test[categorical_cols])
X_test[numerical_cols] = numerical_imputer.transform(X_test[numerical_cols])



In [41]:
from xgboost import XGBClassifier
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Convert class weights to sample weights for each instance in y_train
sample_weights = y_train.map(class_weights_dict).values

# Create and fit the XGBoost model with sample weights
model = XGBClassifier(objective='multi:softmax', num_class=3)
model.fit(X_train, y_train, sample_weight=sample_weights)

# Make predictions on X_test
y_pred = model.predict(X_test)


In [42]:

target_mapping = {0:'low',1: 'medium' , 2:'high'}
y_pred = pd.Series(y_pred)

y_pred= y_pred.map(target_mapping)

In [43]:
uid_column = X_test['UID']  # Assuming 'UID' is a column in X_test

# Combine UID with the predicted labels y_pred
results = pd.DataFrame({'UID': uid_column, 'Target': y_pred})

# Save to CSV file
results.to_csv('predictions.csv', index=False)

