In [None]:
# STEP 1: INSTALL REQUIRED PACKAGES
!pip install -q sentence-transformers scikit-learn

In [None]:
# STEP 2: MOUNT GOOGLE DRIVE
# Attach Google Drive at the mount point so its files can be addressed by path.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# STEP 3: IMPORT MODULES
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [None]:
# STEP 4: CLEANING CLASS
class DataCleaner:
    """Clean a query/answer CSV held in a pandas DataFrame.

    The working frame lives in ``self.df`` and is replaced by each cleaning
    step. Every step is a no-op when the column it needs is absent.

    Attributes:
        df: The working DataFrame, narrowed/updated by the cleaning methods.
        model: SentenceTransformer used to embed ``QueryText`` values.
        original_df: Copy of the frame exactly as loaded from disk.
        numeric_sector_df: Rows set aside by ``remove_numeric_sector_rows``
            (empty until that method has run).
    """

    def __init__(self, file_path, model_name='all-MiniLM-L6-v2'):
        """Load the CSV and the sentence-embedding model.

        Args:
            file_path: Path of the CSV file to clean.
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.df = pd.read_csv(file_path)
        self.model = SentenceTransformer(model_name)
        self.original_df = self.df.copy()
        self.numeric_sector_df = pd.DataFrame()

    def _embed(self, texts):
        """Return one embedding row per text as a 2-D NumPy array."""
        # str() guards against non-string cells reaching the encoder.
        return np.asarray(self.model.encode([str(t) for t in texts], convert_to_numpy=True))

    def remove_columns(self):
        """Drop the fixed set of unwanted columns that are present."""
        cols_to_remove = ['Latitude', 'Longitude', 'QueryType', 'Category', 'Block']
        present = [col for col in cols_to_remove if col in self.df.columns]
        # Non-inplace drop: avoids mutating a frame that may be a slice of another.
        self.df = self.df.drop(columns=present)

    def remove_weather_queries(self):
        """Drop rows whose QueryText contains 'weather' (case-insensitive).

        Rows with a missing QueryText are kept.
        """
        if 'QueryText' in self.df.columns:
            mentions_weather = self.df['QueryText'].str.contains('weather', case=False, na=False)
            self.df = self.df[~mentions_weather]

    def remove_semantically_similar_queries(self, threshold=0.9, batch_size=256):
        """Drop rows whose QueryText is a near-duplicate of an earlier query.

        Unique non-missing queries are embedded in first-occurrence order. A
        query is removed when its cosine similarity to ANY earlier query is at
        least ``threshold`` (the earlier query may itself have been removed).
        All rows carrying a removed query text are dropped.

        Args:
            threshold: Minimum cosine similarity that counts as a duplicate.
            batch_size: Number of queries compared against the rest per
                similarity call; bounds the size of the similarity matrix.
        """
        if 'QueryText' not in self.df.columns:
            return
        queries = self.df['QueryText'].dropna().unique()
        total = len(queries)
        if total < 2:
            # Nothing to compare; also avoids encoding an empty list.
            return

        embeddings = self._embed(queries)
        step = max(1, int(batch_size))
        is_duplicate = np.zeros(total, dtype=bool)
        for start in range(0, total, step):
            # Rows start..stop-1 only need comparing with columns >= start,
            # because a pair (i, j) is considered only when i < j.
            sims = cosine_similarity(embeddings[start:start + step], embeddings[start:])
            # k=1 keeps entries strictly right of the diagonal, i.e. j > i.
            later_hits = np.triu(sims >= threshold, k=1)
            is_duplicate[start:] |= later_hits.any(axis=0)

        to_remove = queries[is_duplicate]
        self.df = self.df[~self.df['QueryText'].isin(to_remove)]

    def clean_kcc_ans(self):
        """Strip digit runs and outer whitespace from KccAns, then drop rows left empty.

        Missing answers stay missing (and their rows are kept) instead of
        being converted to the literal text 'nan'.
        """
        if 'KccAns' not in self.df.columns:
            return
        cleaned = self.df['KccAns'].map(
            lambda value: value if pd.isna(value) else re.sub(r'\d+', '', str(value)).strip()
        )
        # Keep missing answers and anything that still has text after cleaning.
        keep = cleaned.isna() | (cleaned != '')
        # assign() builds a new frame, so no SettingWithCopyWarning on a filtered df.
        self.df = self.df.assign(KccAns=cleaned)[keep]

    def remove_numeric_sector_rows(self, sector_column='Sector'):
        """Move rows whose sector value is all digits into ``numeric_sector_df``.

        Args:
            sector_column: Name of the column to inspect.
        """
        if sector_column in self.df.columns:
            is_numeric = self.df[sector_column].map(lambda value: str(value).isdigit()).astype(bool)
            self.numeric_sector_df = self.df[is_numeric]
            self.df = self.df[~is_numeric]

    def predict_crop_from_querytext(self, threshold=0.7, batch_size=256):
        """Fill missing Crop values from the most similar query with a known crop.

        Rows that have both QueryText and Crop form the reference set. Each
        row with a QueryText but no Crop receives the crop of its most similar
        reference query when that cosine similarity is at least ``threshold``;
        otherwise it is left missing. Rows without a QueryText are ignored.

        Args:
            threshold: Minimum cosine similarity required to copy a crop.
            batch_size: Number of unlabeled queries embedded and compared per step.
        """
        if 'Crop' not in self.df.columns or 'QueryText' not in self.df.columns:
            return

        has_query = self.df['QueryText'].notna()
        has_crop = self.df['Crop'].notna()
        # Reference rows need a usable query text as well as a crop label.
        known = self.df.loc[has_query & has_crop, ['QueryText', 'Crop']]
        target_pos = np.flatnonzero((has_query & ~has_crop).to_numpy())
        if known.empty or target_pos.size == 0:
            return

        known_embeddings = self._embed(known['QueryText'])
        known_crops = known['Crop'].to_numpy()
        queries = self.df['QueryText']
        crops = self.df['Crop'].copy()
        step = max(1, int(batch_size))

        for start in tqdm(range(0, target_pos.size, step)):
            pos = target_pos[start:start + step]
            sims = cosine_similarity(self._embed(queries.iloc[pos]), known_embeddings)
            best = sims.argmax(axis=1)
            confident = sims[np.arange(len(pos)), best] >= threshold
            if confident.any():
                # Positional writes, so duplicate index labels cannot spread a value.
                crops.iloc[pos[confident]] = known_crops[best[confident]]

        self.df = self.df.assign(Crop=crops)

    def save_files(self, cleaned_path, numeric_sector_path):
        """Write the cleaned frame, and the numeric-sector rows when there are any.

        Args:
            cleaned_path: Destination CSV for ``self.df``.
            numeric_sector_path: Destination CSV for ``numeric_sector_df``;
                not written when that frame is empty.
        """
        self.df.to_csv(cleaned_path, index=False)
        if not self.numeric_sector_df.empty:
            self.numeric_sector_df.to_csv(numeric_sector_path, index=False)

In [None]:
# STEP 5: PROVIDE YOUR FILE PATH HERE
# Location of the input CSV on the mounted Drive; swap in the path of your own file.
file_path = (
    '/content/drive/MyDrive/YourFolder/your_file.csv'  # <- UPDATE THIS
)


In [None]:
# STEP 6: RUNNING EVERYTHING
# Load the CSV, then apply each cleaning stage once, in this fixed order.
cleaner = DataCleaner(file_path)
for cleaning_step in (
    cleaner.remove_columns,
    cleaner.remove_weather_queries,
    cleaner.remove_semantically_similar_queries,
    cleaner.clean_kcc_ans,
    cleaner.remove_numeric_sector_rows,
    cleaner.predict_crop_from_querytext,
):
    cleaning_step()


In [None]:
# STEP 7: SAVE RESULTS
# Output locations on the mounted Drive: the cleaned data, and the rows set aside for numeric sectors.
clean_path = '/content/drive/MyDrive/YourFolder/cleaned_file.csv'          # <- UPDATE THIS
numeric_path = '/content/drive/MyDrive/YourFolder/numeric_sector.csv'     # <- UPDATE THIS
cleaner.save_files(cleaned_path=clean_path, numeric_sector_path=numeric_path)