In [1]:
# STEP 1: INSTALL REQUIRED PACKAGES
!pip install -q sentence-transformers scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# STEP 2: MOUNT GOOGLE DRIVE
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install gdown



In [5]:
# Downloading your shared file using gdown
import gdown
# https://drive.google.com/file/d//view?usp=sharing
file_id = "1xav7GfJ656bCedbK1E99A9JOY9OUa0mf"
url = f"https://drive.google.com/uc?id={file_id}"
output = "data.csv"
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1xav7GfJ656bCedbK1E99A9JOY9OUa0mf
From (redirected): https://drive.google.com/uc?id=1xav7GfJ656bCedbK1E99A9JOY9OUa0mf&confirm=t&uuid=387845b0-8c89-435f-b1f1-5c29ebaea728
To: /content/data.csv
100%|██████████| 397M/397M [00:04<00:00, 96.5MB/s]


'data.csv'

In [6]:
!pip install faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [7]:
# STEP 3: IMPORT MODULES
import pandas as pd
import numpy as np
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [8]:
# @title Default title text
# STEP 4: CLEANING CLASS
import os
from sentence_transformers import SentenceTransformer

class DataCleaner:
    def __init__(self, file_path):
        self.df = pd.read_csv(file_path)

        # Define a local directory to save the model
        local_model_path = "./sentence_transformer_model"
        # Download the model if it doesn't exist locally
        if not os.path.exists(local_model_path):
            print(f"Downloading model to {local_model_path}")
            SentenceTransformer('all-MiniLM-L6-v2').save(local_model_path)
            print("Download complete.")

        # Load the model from the local directory
        self.model = SentenceTransformer(local_model_path)

        self.original_df = self.df.copy()
        self.numeric_sector_df = pd.DataFrame()

    def remove_columns(self):
        cols_to_remove = ['latitude', 'longitude', 'QueryType', 'Category', 'BlockName']
        self.df.drop(columns=[col for col in cols_to_remove if col in self.df.columns], inplace=True)

    def remove_weather_queries(self):
        if 'QueryText' in self.df.columns:
            self.df = self.df[~self.df['QueryText'].str.contains('weather', case=False, na=False)]

    def predict_crop_from_querytext_faiss(self, threshold=0.7):
        if 'Crop' not in self.df.columns or 'QueryText' not in self.df.columns:
            return

        print("Preparing known and unknown query sets...")
        known = self.df[~self.df['Crop'].isna() & self.df['QueryText'].notna()][['QueryText', 'Crop']]
        unknown = self.df[self.df['Crop'].isna() & self.df['QueryText'].notna()][['QueryText']]

        if known.empty or unknown.empty:
            print("No known or unknown crop queries available.")
            return

        print("Encoding known queries...")
        known_embeddings = self.model.encode(known['QueryText'].tolist(), convert_to_numpy=True)
        faiss.normalize_L2(known_embeddings)

        index = faiss.IndexFlatIP(known_embeddings.shape[1])
        index.add(known_embeddings)

        print("Encoding unknown queries and predicting...")
        unknown_embeddings = self.model.encode(unknown['QueryText'].tolist(), convert_to_numpy=True)
        faiss.normalize_L2(unknown_embeddings)

        D, I = index.search(unknown_embeddings, 1)

    # Assign predicted crops based on similarity threshold
        for i, (score, idx) in enumerate(zip(D, I)):
            if score[0] >= threshold:
                predicted_crop = known.iloc[idx[0]]['Crop']
                row_index = unknown.index[i]
                self.df.at[row_index, 'Crop'] = predicted_crop


    def clean_kcc_ans(self):
        if 'KccAns' in self.df.columns:
             self.df['KccAns'] = self.df['KccAns'].astype(str).apply(lambda x: re.sub(r'\d+', '', x).strip())
             self.df = self.df[self.df['KccAns'] != '']  # Remove rows where KccAns is now empty


    def remove_numeric_sector_rows(self, sector_column='Sector'):
        if sector_column in self.df.columns:
            is_numeric = self.df[sector_column].apply(lambda x: str(x).isdigit())
            self.numeric_sector_df = self.df[is_numeric]
            self.df = self.df[~is_numeric]

    def remove_semantically_similar_queries_faiss(self, threshold=0.9):
        if 'QueryText' not in self.df.columns:
            return

        queries = self.df['QueryText'].dropna().unique().tolist()
        if len(queries) == 0:
            return

        print(f"Encoding {len(queries)} queries...")
        embeddings = self.model.encode(queries, convert_to_numpy=True)
        faiss.normalize_L2(embeddings)

        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        print("Searching for similar queries using FAISS...")
        D, I = index.search(embeddings, 2)

        to_remove = set()
        for i, (score, idxs) in enumerate(zip(D, I)):
            if score[1] >= threshold:
                to_remove.add(queries[idxs[1]])

        self.df = self.df[~self.df['QueryText'].isin(to_remove)]

    def save_files(self, cleaned_path, numeric_sector_path):
        self.df.to_csv(cleaned_path, index=False)
        if not self.numeric_sector_df.empty:
            self.numeric_sector_df.to_csv(numeric_sector_path, index=False)

In [9]:
# STEP 5: PROVIDE YOUR FILE PATH HERE
# Replace this with your own file path from Drive
file_path = '/content/drive/MyDrive/YourFolder/your_file.csv'  # <- UPDATE THIS


In [10]:
# STEP 6: RUNNING EVERYTHING
cleaner = DataCleaner("data.csv")
cleaner.remove_columns()
cleaner.remove_weather_queries()
cleaner.remove_semantically_similar_queries_faiss(threshold=0.9)
cleaner.clean_kcc_ans()
cleaner.remove_numeric_sector_rows()
cleaner.predict_crop_from_querytext_faiss(threshold=0.7)


  self.df = pd.read_csv(file_path)


Downloading model to ./sentence_transformer_model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Download complete.
Encoding 112075 queries...
Searching for similar queries using FAISS...
Preparing known and unknown query sets...
Encoding known queries...
Encoding unknown queries and predicting...


In [12]:
# STEP 7: SAVE RESULTS
clean_path = '/content/drive/MyDrive/Punjab_cleaned_file.csv'          # <- UPDATE THIS
numeric_path = '/content/drive/MyDrive/punjab_numeric_sector.csv'     # <- UPDATE THIS
cleaner.save_files(clean_path, numeric_path)