<a href="https://colab.research.google.com/github/yasirsid2004/upskillcampus/blob/main/upskillcampus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import os

# Visible confirmation that every import in this cell resolved without raising.
print("--- Libraries imported successfully! ---")

--- Libraries imported successfully! ---


In [None]:
# Load the cost/yield table and the variety/zone table, normalise their
# headers, and left-join the variety information onto every cost row.
#
# Produces: df_cost (cleaned cost table), df_variety (one row per crop),
# and df (the merged frame used by the following cells).
try:
    # Load the dataset with cost and yield information
    df_cost = pd.read_csv('/content/datafile (1).csv')
    print("Successfully loaded datafile (1).csv")

    # Load the dataset with variety and zone information
    df_variety = pd.read_csv('/content/datafile (3).csv')
    print("Successfully loaded datafile (3).csv")
except FileNotFoundError as e:
    print(f"\nError: A file was not found. Please make sure both 'datafile (1).csv' and 'datafile (3).csv' are uploaded to the main '/content/' directory.")
    print(f"Missing file: {e.filename}")
    # Re-raise rather than calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas a raised exception simply stops this cell (and a
    # "Run All") while keeping the session alive.
    raise

# Any other failure below is left to propagate so the full traceback is shown.

# --- Step 1: Clean the first dataset (df_cost) ---
df_cost.columns = df_cost.columns.str.strip().str.lower()
df_cost = df_cost.rename(columns={
    'cost of cultivation (`/hectare) a2+fl': 'cost_cultivation_a2fl',
    'cost of cultivation (`/hectare) c2': 'cost_cultivation_c2',
    'cost of production (`/quintal) c2': 'cost_production_c2',
    'yield (quintal/ hectare)': 'yield' # This will be our target to predict
})
print("\nCleaned column names for datafile (1).csv")

# --- Step 2: Clean the second dataset (df_variety) ---
df_variety.columns = df_variety.columns.str.strip().str.lower()
# pandas names blank CSV headers "Unnamed: N" (lower-cased above), so a check
# for an empty-string column name alone never matches. Placeholder-named
# columns are dropped only when they hold no data at all.
empty_placeholder_cols = [
    name for name in df_variety.columns
    if name == ''
    or (name.startswith('unnamed:') and df_variety[name].isna().to_numpy().all())
]
if empty_placeholder_cols:
    df_variety = df_variety.drop(columns=empty_placeholder_cols)

# Match crops on a trimmed, upper-cased key. The previously recorded output
# shows every variety column as NaN after the merge, i.e. no 'crop' value
# matched exactly. NOTE(review): case/whitespace differences are the presumed
# cause -- confirm against the match count printed below.
CROP_KEY = '_crop_key'
variety_key = df_variety['crop'].str.strip().str.upper()

# Keep only the first entry for each crop to avoid creating messy duplicates
df_variety = df_variety.loc[~variety_key.duplicated(keep='first')]
print("Cleaned column names for datafile (3).csv")

# --- Step 3: Merge the two datasets ---
# We combine the two tables using the normalised crop name as the common link.
# The original 'crop' column of df_cost is kept untouched in the result.
left = df_cost.assign(**{CROP_KEY: df_cost['crop'].str.strip().str.upper()})
right = df_variety.drop(columns=['crop']).assign(**{CROP_KEY: variety_key})
df = left.merge(right, on=CROP_KEY, how='left', indicator=True)

# indicator=True adds a '_merge' column; 'both' marks rows that found a match.
matched_rows = int((df['_merge'] == 'both').sum())
df = df.drop(columns=[CROP_KEY, '_merge'])

print("\n--- Successfully merged the two datasets! ---")
print(f"{matched_rows} of {len(df)} rows were matched with variety information.")
if matched_rows == 0:
    print("Warning: no crop names matched between the two files; "
          "the variety, season and zone columns will be entirely empty.")

Successfully loaded datafile (1).csv
Successfully loaded datafile (3).csv

Cleaned column names for datafile (1).csv
Cleaned column names for datafile (3).csv

--- Successfully merged the two datasets! ---


In [None]:
# Quick look at the merged frame: sample rows, schema, and per-column null counts.
preview_rows = df.head(5)
print("\n--- First 5 rows of the new merged dataset ---", preview_rows, sep="\n")

print("\n--- Basic Information about the dataset ---")
df.info()  # writes its report straight to stdout

missing_per_column = df.isna().sum()
print("\n--- Checking for Missing (empty) Values ---", missing_per_column, sep="\n")


--- First 5 rows of the new merged dataset ---
    crop           state  cost_cultivation_a2fl  cost_cultivation_c2  \
0  ARHAR   Uttar Pradesh                9794.05             23076.74   
1  ARHAR       Karnataka               10593.15             16528.68   
2  ARHAR         Gujarat               13468.82             19551.90   
3  ARHAR  Andhra Pradesh               17051.66             24171.65   
4  ARHAR     Maharashtra               17130.55             25270.26   

   cost_production_c2  yield variety season/ duration in days  \
0             1941.55   9.83     NaN                      NaN   
1             2172.46   7.47     NaN                      NaN   
2             1898.30   9.59     NaN                      NaN   
3             3670.54   6.42     NaN                      NaN   
4             2775.80   8.72     NaN                      NaN   

  recommended zone  unnamed: 4  
0              NaN         NaN  
1              NaN         NaN  
2              NaN         Na

In [None]:
# Prepare the merged frame for modelling: drop duplicate column names, fill
# missing values, and label-encode text columns. `df` is updated in place and
# `label_encoders` maps each encoded column name to its fitted LabelEncoder.
#
# NOTE(review): fill values and encoders are computed over every row and
# every column of `df` (including the prediction target) before any
# train/test split happens, so test rows influence the preprocessing.
# NOTE: re-running this cell on an already-encoded `df` finds no text columns
# and resets `label_encoders` to {}; re-run the load/merge cell first.
print("\n--- Preparing data for modeling... ---")

# --- FIX: Remove duplicate columns ---
# This error can happen if the merged file has duplicate column names
# (e.g., 'crop' and 'crop'). This line keeps only the first occurrence of each.
original_col_count = len(df.columns)
# .copy() guarantees an independent frame, so the column assignments below
# never write into a slice of the previous `df`.
df = df.loc[:, ~df.columns.duplicated(keep='first')].copy()
new_col_count = len(df.columns)

if original_col_count != new_col_count:
    print(f"Removed {original_col_count - new_col_count} duplicate columns.")
print("Current columns:", df.columns.tolist())
# --- End of Fix ---


# Decide once which columns hold text, so that filling and encoding agree.
# Both object dtype and pandas' dedicated string dtype count as text.
text_cols = [
    col_name for col_name in df.columns
    if pd.api.types.is_object_dtype(df[col_name])
    or pd.api.types.is_string_dtype(df[col_name])
]

# Step 1: Fill missing values
print("Filling missing values...")
for col_name in df.columns:
    column = df[col_name]
    if col_name in text_cols:
        # Most common value; mode() is empty when the column is entirely NaN,
        # in which case a placeholder is used instead.
        modes = column.mode()
        fill_value = modes.iloc[0] if not modes.empty else "Unknown"
    else:
        fill_value = column.median()
        # The median of an all-NaN numeric column is itself NaN, which would
        # leave the column unfilled; fall back to 0 so no NaN survives.
        if pd.isna(fill_value) and pd.api.types.is_numeric_dtype(column):
            fill_value = 0
    df[col_name] = column.fillna(fill_value)
print("Missing values have been filled.")


# Step 2: Convert text columns to numbers using Label Encoding
print("Converting text columns to numbers...")
categorical_cols = df.columns[df.columns.isin(text_cols)]
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str so an object column mixing strings and numbers can be sorted
    # by the encoder; all-string columns are unaffected.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
print("Text columns have been converted to numbers.")

print("\n--- Data after cleaning and encoding ---")
print(df.head())



--- Preparing data for modeling... ---
Current columns: ['crop', 'state', 'cost_cultivation_a2fl', 'cost_cultivation_c2', 'cost_production_c2', 'yield', 'variety', 'season/ duration in days', 'recommended zone', 'unnamed: 4']
Filling missing values...
Missing values have been filled.
Converting text columns to numbers...
Text columns have been converted to numbers.

--- Data after cleaning and encoding ---
   crop  state  cost_cultivation_a2fl  cost_cultivation_c2  \
0     0     11                9794.05             23076.74   
1     0      4               10593.15             16528.68   
2     0      2               13468.82             19551.90   
3     0      0               17051.66             24171.65   
4     0      6               17130.55             25270.26   

   cost_production_c2  yield  variety  season/ duration in days  \
0             1941.55   9.83        0                         0   
1             2172.46   7.47        0                         0   
2             1

In [None]:
# Choose the model inputs and the prediction target, then hold out 20% of the
# rows for testing (fixed random_state so the split is reproducible).
features = ['crop', 'state', 'cost_cultivation_c2', 'variety', 'season/ duration in days', 'recommended zone']
target = 'yield'

# Only the column look-ups can raise KeyError, so only they are guarded.
try:
    X = df[features]
    y = df[target]
except KeyError as e:
    print(f"\nError selecting features: {e}")
    print("This means a column name in the 'features' list is incorrect. Please double-check the column names printed in Cell 3.")
    # Re-raise instead of exit(): in a notebook exit() shuts the kernel down
    # and discards all loaded data; raising just stops execution here.
    raise

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nData has been split successfully:")
print(f"- Training set has {X_train.shape[0]} samples.")
print(f"- Testing set has {X_test.shape[0]} samples.")



Data has been split successfully:
- Training set has 39 samples.
- Testing set has 10 samples.


In [None]:
# Fit a random-forest regressor on the training split.
print("\n--- Training the prediction model... ---")
forest_params = dict(
    n_estimators=100,  # number of trees in the forest
    random_state=42,   # fixed seed for repeatable results
    n_jobs=-1,         # use all available CPU cores
)
model = RandomForestRegressor(**forest_params).fit(X_train, y_train)  # fit() returns the estimator
print("Model training is complete!")



--- Training the prediction model... ---
Model training is complete!


In [None]:
# Score the fitted model on the held-out split and rank the input features
# by the importance the forest assigned to them.
print("\n--- Evaluating the model's performance ---")
y_pred = model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R-squared (R²) Score: {r2:.2f}",
    sep="\n",
)

# One row per feature, highest importance first; the index keeps each
# feature's original position in `features`.
importance_table = pd.DataFrame(
    {'Feature': features, 'Importance': model.feature_importances_}
)
importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("\n--- Most Important Features for Prediction ---", importance_df, sep="\n")


--- Evaluating the model's performance ---
Mean Absolute Error (MAE): 36.23
R-squared (R²) Score: 0.96

--- Most Important Features for Prediction ---
                    Feature  Importance
2       cost_cultivation_c2    0.837139
0                      crop    0.137239
1                     state    0.025622
3                   variety    0.000000
4  season/ duration in days    0.000000
5          recommended zone    0.000000
