In [None]:
# STEP 1: Setup
# Mount Google Drive at /content/drive; the later cells read and write their
# files below that mount point.
#!pip install -q pandas
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re

In [None]:
# STEP 2: Load CSV
import os
import pandas as pd

folder_path = "/content/drive/MyDrive/Annam.ai"

# Automatically detect the *_dataset_cleaned.csv file.
# os.listdir() returns entries in arbitrary order, so the matches are sorted to
# make the choice of csv_files[0] reproducible when several files are present.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith("_dataset_cleaned.csv"))
if not csv_files:
    # FileNotFoundError is a subclass of Exception, so handlers written for the
    # previous bare Exception still catch it. The message now names the suffix
    # that is actually searched for.
    raise FileNotFoundError("No *_dataset_cleaned.csv file found in Annam.ai folder!")

# First match in sorted order is the file that gets cleaned.
file_name = csv_files[0]
file_path = os.path.join(folder_path, file_name)

df = pd.read_csv(file_path)
print(f"✅ Loaded: {file_name} with {len(df)} rows")


✅ Loaded: ARUNACHAL PRADESH_dataset_cleaned.csv with 2039 rows


In [None]:
# STEP 3: Load bad word list
bad_words_path = os.path.join(folder_path, "bad_words.txt")

# Make sure the list file exists; an empty one is created when it is missing.
if not os.path.exists(bad_words_path):
    open(bad_words_path, "w").close()

# Read the list: one entry per line, trimmed and lower-cased, blank lines skipped.
unwanted_words = []
with open(bad_words_path, "r") as f:
    for raw_line in f:
        entry = raw_line.strip()
        if entry:
            unwanted_words.append(entry.lower())

print(f"🛑 {len(unwanted_words)} unwanted words loaded.")


🛑 148 unwanted words loaded.


In [None]:

# Discard rows whose QueryText is missing, empty, or whitespace-only.
query_present = df['QueryText'].notna()
df = df[query_present]
query_nonblank = df['QueryText'].str.strip() != ""
df = df[query_nonblank]


# Discard rows whose QueryText holds no ASCII letter at all (digits/symbols only).
def _has_ascii_letter(value):
    """Return True when str(value) contains at least one character in a-z or A-Z."""
    return re.search(r'[a-zA-Z]', str(value)) is not None


df = df[df['QueryText'].apply(_has_ascii_letter)]

print(f"✅ Cleaned dataframe shape: {df.shape}")

✅ Cleaned dataframe shape: (2039, 10)


In [None]:
# Drop rows whose trimmed QueryText is a single token of at most two characters.
def _is_tiny_single_word(query):
    """Return True for a one-word query that is at most 2 characters long once trimmed."""
    trimmed = query.strip()
    return len(trimmed.split()) == 1 and len(trimmed) <= 2


tiny_query_mask = df['QueryText'].apply(_is_tiny_single_word)
df = df[~tiny_query_mask]

print(f"✅ Data after removing 1-word queries with ≤2 letters: {df.shape}")


✅ Data after removing 1-word queries with ≤2 letters: (2039, 10)


In [None]:
def looks_like_gibberish(text):
    """Heuristically decide whether a short query looks like keyboard mashing.

    Only queries with at most two alphabetic words are judged; anything longer
    is never reported as gibberish.

    Args:
        text: The query. Missing values (``pd.isna``) are treated as "not
            gibberish"; any other value is converted with ``str()``.

    Returns:
        bool: True when one of rules B-F below fires, otherwise False.
    """
    if pd.isna(text):
        return False

    text = str(text).strip().lower()

    # Rule 0: keep only ASCII letters and whitespace for the analysis.
    cleaned_text = re.sub(r'[^a-z\s]', '', text)
    words = cleaned_text.split()

    # Rule A: only single words or two-word phrases are judged.
    if len(words) > 2:
        return False

    # Letters with *all* whitespace removed, computed once and shared by the
    # rules below. Stripping only the space character would let an internal tab
    # or newline count as a letter in Rule B and break the repeat match in Rule D.
    letters_only = re.sub(r'\s+', '', cleaned_text)
    letter_count = len(letters_only)

    # Rule B: too short (<= 2 letters) or too long (> 15 letters).
    if letter_count <= 2 or letter_count > 15:
        return True

    # Rule C: no vowels at all.
    if not re.search(r'[aeiou]', letters_only):
        return True

    # Rule D: a 1-3 letter unit repeated three or more times covering the whole
    # string (like "jhjhjhjh" or "ababab").
    if re.match(r'^(.{1,3})\1{2,}$', letters_only):
        return True

    # Rule E: mostly consonants (> 80%). letter_count is at least 3 here
    # because Rule B already returned for shorter strings.
    consonants = re.findall(r'[bcdfghjklmnpqrstvwxyz]', letters_only)
    if len(consonants) / letter_count > 0.8:
        return True

    # Rule F: weird structure like "cg gfghcf" - any word longer than three
    # letters that contains a run of four or more non-vowels.
    for word in words:
        if len(word) > 3 and re.search(r'[^aeiou]{4,}', word):
            return True

    return False

# Flag every query with looks_like_gibberish, then keep only the unflagged rows.
query_as_text = df['QueryText'].astype(str)
gibberish_mask = query_as_text.apply(looks_like_gibberish)
df = df[~gibberish_mask]

print(f"✅ Removed gibberish-looking one-word queries. Rows remaining: {len(df)}")


✅ Removed gibberish-looking one-word queries. Rows remaining: 2015


In [None]:
# # Drop rows with only one word in QueryText
# df = df[df['QueryText'].astype(str).str.strip().str.split().apply(len) > 1]

# print(f"✅ Rows with only one word removed. Remaining rows: {len(df)}")


In [None]:
# STEP 4: Remove rows with <=2 words and containing unwanted words
def is_bad_query(text, bad_words=None):
    """Return True when a short query (at most two words) hits the bad-word list.

    A query matches when any single word of it is a listed entry, or when the
    whole query (lower-cased, whitespace collapsed) equals a listed entry. The
    whole-query comparison is what lets multi-word entries such as "test call"
    match: an entry containing a space can never equal an individual word.

    Args:
        text: Query to test. Missing values return False; any other value is
            converted with ``str()``.
        bad_words: Optional iterable of unwanted entries. When omitted, the
            notebook-level ``unwanted_words`` list is used.

    Returns:
        bool: True if the query should be removed.
    """
    if pd.isna(text):
        return False
    words = str(text).lower().split()
    if len(words) > 2:
        return False
    if bad_words is None:
        bad_words = unwanted_words
    # Normalise entries the same way as the query (lower-case, single spaces)
    # so "test  call" in the list and "Test Call" in the data compare equal.
    blocked = {" ".join(str(entry).lower().split()) for entry in bad_words}
    # A blank entry must never match an empty query.
    blocked.discard("")
    return " ".join(words) in blocked or any(word in blocked for word in words)

# Flag short queries that hit the bad-word list, keep the rest, and report
# how many rows were dropped.
before_count = len(df)
bad_query_mask = df['QueryText'].apply(is_bad_query)
df_cleaned = df[~bad_query_mask]
removed = before_count - len(df_cleaned)

print(f"🧹 Removed {removed} bad rows. Remaining rows: {len(df_cleaned)}")


🧹 Removed 229 bad rows. Remaining rows: 1786


In [None]:
# STEP 5: Save cleaned file
# The output name is derived from the input name by replacing
# "_dataset_cleaned.csv" with "_final_dataset.csv"; the file is written into
# folder_path, the same folder the input was read from.
cleaned_filename = file_name.replace("_dataset_cleaned.csv", "_final_dataset.csv")
cleaned_path = os.path.join(folder_path, cleaned_filename)
# index=False keeps the DataFrame index out of the CSV.
df_cleaned.to_csv(cleaned_path, index=False)

print(f"✅ Cleaned data saved to: {cleaned_path}")


✅ Cleaned data saved to: /content/drive/MyDrive/Annam.ai/ARUNACHAL PRADESH_final_dataset.csv


In [None]:
# STEP 6: Generate remaining short phrases for review
# Collect the distinct, non-missing queries of at most two words that survived
# the cleaning steps, in order of first appearance.
query_column = df_cleaned['QueryText']
query_text = query_column[query_column.notna()].astype(str)
word_counts = query_text.str.split().apply(len)
remaining_short = query_text[word_counts <= 2].unique().tolist()

# Write them to a single-column CSV for manual review.
review_path = os.path.join(folder_path, "next_review_short_phrases.csv")
review_frame = pd.DataFrame(remaining_short, columns=["QueryText"])
review_frame.to_csv(review_path, index=False)

print(f"✅ Saved remaining short queries to: {review_path}")


print(f"🧐 Next review file saved to: {review_path}")


✅ Saved remaining short queries to: /content/drive/MyDrive/Annam.ai/next_review_short_phrases.csv
🧐 Next review file saved to: /content/drive/MyDrive/Annam.ai/next_review_short_phrases.csv


In [None]:
# Open and edit manually in Colab
bad_words_path = "/content/drive/MyDrive/Annam.ai/bad_words.txt"

# View current bad words. Blank lines are skipped so they are not carried into
# the list and written back as empty entries.
with open(bad_words_path, 'r') as f:
    bad_words = [line.strip() for line in f if line.strip()]

print("✏️ Current bad words:")
print(bad_words)

# Manually add new ones (example)
bad_words.extend(["irrelevant", "vety"])

# Save updated list.
# dict.fromkeys() drops duplicates while keeping first-seen order. A set has no
# stable order for strings across kernel restarts, so it rewrote the file in a
# different order on every run. The file ends with a newline so later appends
# start on their own line.
with open(bad_words_path, 'w') as f:
    f.write('\n'.join(dict.fromkeys(bad_words)) + '\n')


✏️ Current bad words:
['WEATHHER', 'vety q', 'Sound', 'disconected', 'Details', 'ACD', 'farmer', 'WEATER', 'test  cal', 'Test-Call', 'report', 'WEATHE', 'cal transpered', 'vbbs', 'pm kisan', 'diss', 'dis continive', 'update', 'DIS', 'details', 'RAQINFALL', 'IN4MATION', 'cond', 'redail', 'Disconnect', 'test cll', 'rain fall', 'INCOMPLETE', 'manipuri call', 'WEATEHER', 'weatherin', 'pm-kisan e-kyc', 'registration', 'monsoon', 'Busy', 'call', 'TestCall', 'tell', 'VET', 'wether', 'manipuri farmer', 'WEATHEER', 'manipuri q', 'weatehr', 'SAME', 'Internet', 'time pas', 'rainfal', 'same', 'hello', 'weathere', 'cal', 'not', 'farmers registration', 'remote', 'CCALL', 'weather', 'network', 'climate', 'MANSOON', 'FOECAST', 'test', 'NUMBER', 'test  call', 'tasting', 'FORECAST', 'REPORT', 'forecast', 'manipuri language', 'id', 'price', 'rain', 'WEARHER', 'How', 'monsoons', 'FORCAST', 'WAETHER', 'irrelivant cal', 'diissconected', 'tesy call', 'Thank You', 'ABOUT', 'conmd', 'vety quary', 'blank call',

In [None]:
""