<a href="https://colab.research.google.com/github/yogeshHax/Bid_Dragon_hatchling/blob/main/data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# data_pipeline.py

# Train/test data loading, text cleaning, TF-IDF vectorization, Random Forest training, over-fitting and under-fitting checks, validation, and hyperparameter tuning

In [2]:
# Colab-only step: opens the browser file picker so train.csv and test.csv can
# be uploaded into the runtime's working directory.
# NOTE: the recorded output of this cell shows train.csv being saved as
# "train (1).csv", i.e. an older train.csv was already present. Load the file
# under the name it was actually saved as, or remove the stale copy first.
from google.colab import files
uploaded = files.upload()

Saving train.csv to train (1).csv
Saving test.csv to test.csv


In [11]:
!pip install pandas scikit-learn numpy



In [13]:
import sklearn

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report

In [3]:
# text cleaning
def clean_text(text):
    """Normalize one raw text value before vectorization.

    The value is lower-cased, every run starting with ``http`` (up to the next
    whitespace) is removed, every character other than ``a``-``z`` and
    whitespace is dropped, and whitespace runs are collapsed to single spaces
    with the ends trimmed.

    Args:
        text: A string, a missing value (``None`` / ``NaN`` / ``pd.NA``), or
            another scalar such as an ``int`` or ``float``.

    Returns:
        str: The cleaned text. Missing values yield a single space ``" "``.
    """
    if pd.isna(text):
        return " "

    # A CSV column can hold non-string scalars (pandas infers numeric dtypes
    # for numeric-looking cells); coerce them so .lower() cannot raise
    # AttributeError.
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # strip URLs
    text = re.sub(r"[^a-z\s]", "", text)     # keep only letters and whitespace
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs
    return text

In [4]:
# load data
def load_data(train_path, test_path):
    """Read the training and test CSV files and check their required columns.

    Args:
        train_path: Location of the training CSV, passed to ``pd.read_csv``.
        test_path: Location of the test CSV, passed to ``pd.read_csv``.

    Returns:
        tuple: ``(train_df, test_df)`` exactly as produced by ``pd.read_csv``.

    Raises:
        ValueError: If the training frame lacks a ``text`` or ``label``
            column, or the test frame lacks a ``text`` column.
    """
    # Both files are read before any validation happens.
    train_df, test_df = (pd.read_csv(path) for path in (train_path, test_path))

    train_ok = all(col in train_df.columns for col in ("text", "label"))
    if not train_ok:
        raise ValueError("train.csv must contain 'text' and 'label' columns")

    test_ok = "text" in test_df.columns
    if not test_ok:
        raise ValueError("test.csv must contain 'text' column")

    return train_df, test_df

In [5]:
# preprocess
def preprocess(train_df, test_df, min_length=20):
    """Add a cleaned text column and prune unusable training rows.

    A ``clean_text`` column (``clean_text`` applied to ``text``) is added to
    both frames. Training rows whose cleaned text is not longer than
    ``min_length`` characters are dropped, then training rows that repeat an
    earlier ``(clean_text, label)`` pair are dropped. The test frame is never
    filtered or de-duplicated.

    Side effect: the ``clean_text`` column is written onto the frames that
    were passed in.

    Args:
        train_df: Training frame with ``text`` and ``label`` columns.
        test_df: Test frame with a ``text`` column.
        min_length: Training rows are kept only when their cleaned text has
            more than this many characters. Defaults to 20.

    Returns:
        tuple: ``(train_df, test_df)`` — the pruned training frame (original
        index labels preserved) and the test frame with its new column.
    """
    train_df["clean_text"] = train_df["text"].apply(clean_text)
    test_df["clean_text"] = test_df["text"].apply(clean_text)

    # Boolean indexing returns a new frame rather than filtering in place, so
    # the result has to be assigned for the length filter to take effect.
    long_enough = train_df["clean_text"].str.len() > min_length
    train_df = train_df[long_enough]

    train_df = train_df.drop_duplicates(subset=["clean_text", "label"])

    return train_df, test_df


In [6]:
#vectorize
def vectorize(train_text, test_text):
    """Fit a TF-IDF vectorizer on the training text and encode both text sets.

    The vectorizer is configured with ``max_features=5000``,
    ``ngram_range=(1, 2)`` and ``stop_words="english"``. It is fitted on
    ``train_text`` only; ``test_text`` is transformed with the fitted
    vectorizer.

    NOTE(review): if the returned training matrix is later split into
    train/validation parts, the validation rows will already have contributed
    to the fitted vocabulary and IDF weights — confirm this is acceptable.

    Args:
        train_text: Iterable of training documents.
        test_text: Iterable of test documents.

    Returns:
        tuple: ``(X_train_full, X_test, vectorizer)`` — the two encoded
        matrices followed by the fitted ``TfidfVectorizer``.
    """
    tfidf_settings = {
        "max_features": 5000,
        "ngram_range": (1, 2),
        "stop_words": "english",
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)

    train_matrix = vectorizer.fit_transform(train_text)
    test_matrix = vectorizer.transform(test_text)

    return train_matrix, test_matrix, vectorizer

In [7]:
# split data
def split_data(X, y):
    """Split features and labels into training and validation parts.

    Delegates to ``train_test_split`` with 20% of the rows held out, a fixed
    ``random_state`` of 42, and stratification on ``y``.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``; also used for stratifying.

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns for two input arrays.
    """
    split_options = dict(test_size=0.2, random_state=42, stratify=y)
    return train_test_split(X, y, **split_options)

In [16]:
# Model tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
def train_random_forest(X_train, y_train):
    """Tune a Random Forest with a grid search and return the best model.

    Runs a 3-fold ``GridSearchCV`` over ``n_estimators``, ``max_depth``,
    ``min_samples_split`` and ``min_samples_leaf``, scoring each candidate
    with ``"f1"``.

    NOTE(review): ``scoring="f1"`` is scikit-learn's binary F1 scorer —
    confirm the labels are binary, or switch to an averaged variant.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        tuple: ``(best_estimator, best_params)`` — the estimator exposed by
        the search as ``best_estimator_`` and the winning parameter dict.
    """
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    param_grid = {
        "n_estimators": [100, 200],
        "max_depth": [8, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
    }

    grid = GridSearchCV(
        rf,
        param_grid,
        cv=3,
        scoring="f1",
        n_jobs=-1,
        verbose=1,
    )

    grid.fit(X_train, y_train)

    # Both values must sit on the same return statement: a trailing comma
    # followed by a line break returns a 1-tuple and never reaches the params.
    return grid.best_estimator_, grid.best_params_

In [17]:
# evaluating the model

def evaluate(model, X_train, y_train, X_val, y_val, overfit_gap=0.1, min_val_acc=0.6):
    """Print train/validation accuracy, a fit diagnosis and a validation report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
        overfit_gap: "Overfitting detected" is printed when training accuracy
            exceeds validation accuracy by more than this amount.
            Defaults to 0.1.
        min_val_acc: If the overfitting check does not fire, "Underfitting
            detected" is printed when validation accuracy is below this value.
            Defaults to 0.6.

    Returns:
        None. All results are printed.
    """
    # Predict once per split; the validation predictions feed both the
    # accuracy score and the classification report.
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)

    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)

    print("\n--- PERFORMANCE ---")
    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")

    # The overfitting check takes precedence over the underfitting check.
    if train_acc - val_acc > overfit_gap:
        print("Overfitting detected")
    elif val_acc < min_val_acc:
        print("Underfitting detected")
    else:
        print("Good generalization")

    print("\nValidation Classification Report:")
    print(classification_report(y_val, val_pred))