**INSTALL AND IMPORT LIBRARIES**

In [9]:
!pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np



**LOAD DATA**

In [10]:
df = pd.read_csv('leads.csv', low_memory=False, encoding='latin-1')
print(f"Shape: {df.shape}")
df.head()

Shape: (54294, 39)


Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,...,secondary_market,product_crowdfunding,round_A,round_B,round_C,round_D,round_E,round_F,round_G,round_H
0,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,News,1750000,acquired,USA,NY,New York City,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Publishing,40000,operating,EST,,Tallinn,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Electronics,1500000,operating,GBR,,London,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,/organization/r-ranch-and-mine,-R- Ranch and Mine,,|Tourism|Entertainment|Games|,Tourism,60000,operating,USA,TX,Dallas,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**DATA CLEANING**

In [12]:
df = pd.read_csv('leads.csv', low_memory=False, encoding='latin-1')
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Show all columns to verify
print(df.columns.tolist())
def clean_funding(x):
    if pd.isnull(x) or str(x).strip() in ['-', '']:
        return 0
    try:
        return int(str(x).replace(',', '').replace(' ', '').replace('$', ''))
    except:
        return 0

df['funding_total_usd'] = df['funding_total_usd'].apply(clean_funding)

round_cols = [col for col in df.columns if col.startswith('round_') or col in [
    'seed','venture','equity_crowdfunding','undisclosed','convertible_note','debt_financing',
    'angel','grant','private_equity','post_ipo_equity','post_ipo_debt','secondary_market','product_crowdfunding'
]]
for col in round_cols:
    if col in df.columns:
        df[col] = df[col].apply(clean_funding)

['permalink', 'name', 'homepage_url', 'category_list', 'market', 'funding_total_usd', 'status', 'country_code', 'state_code', 'region', 'city', 'funding_rounds', 'founded_at', 'founded_month', 'founded_quarter', 'founded_year', 'first_funding_at', 'last_funding_at', 'seed', 'venture', 'equity_crowdfunding', 'undisclosed', 'convertible_note', 'debt_financing', 'angel', 'grant', 'private_equity', 'post_ipo_equity', 'post_ipo_debt', 'secondary_market', 'product_crowdfunding', 'round_a', 'round_b', 'round_c', 'round_d', 'round_e', 'round_f', 'round_g', 'round_h']


**HANDLE MISSING VALUES**

In [13]:
text_cols = ['market', 'category_list', 'status', 'country_code', 'state_code', 'region', 'city', 'name']
for col in text_cols:
    if col in df.columns:
        df[col] = df[col].fillna('').astype(str).str.strip().str.lower()

if 'funding_rounds' in df.columns:
    df['funding_rounds'] = pd.to_numeric(df['funding_rounds'], errors='coerce').fillna(0).astype(int)

**FEATURE ENGINEERING**

In [14]:
# Lead Scoring
df['converted'] = (df['funding_total_usd'] > 1_000_000).astype(int)

# Intent Detection
intent_keywords = ['hiring', 'launch', 'ai', 'funding']
def detect_intent(row):
    text = f"{row.get('category_list','')} {row.get('market','')}"
    return int(any(kw in text for kw in intent_keywords))

df['intent'] = df.apply(detect_intent, axis=1)

# Company Similarity
df['text_for_similarity'] = (
    df.get('name','') + ' ' +
    df.get('category_list','') + ' ' +
    df.get('market','')
).str.strip()

**FINAL DATA**

In [15]:
print(df[['funding_total_usd', 'converted', 'intent', 'text_for_similarity']].isnull().sum())

df.to_csv('leads_cleaned.csv', index=False)
print("Cleaned and feature-ready data saved as leads_cleaned.csv")
df.head()

funding_total_usd      0
converted              0
intent                 0
text_for_similarity    0
dtype: int64
Cleaned and feature-ready data saved as leads_cleaned.csv


Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,...,round_b,round_c,round_d,round_e,round_f,round_g,round_h,converted,intent,text_for_similarity
0,/organization/waywire,#waywire,http://www.waywire.com,|entertainment|politics|social media|news|,news,1750000,acquired,usa,ny,new york city,...,0,0,0,0,0,0,0,1,1,#waywire |entertainment|politics|social media|...
1,/organization/tv-communications,&tv communications,http://enjoyandtv.com,|games|,games,4000000,operating,usa,ca,los angeles,...,0,0,0,0,0,0,0,1,0,&tv communications |games| games
2,/organization/rock-your-paper,'rock' your paper,http://www.rockyourpaper.org,|publishing|education|,publishing,40000,operating,est,,tallinn,...,0,0,0,0,0,0,0,0,0,'rock' your paper |publishing|education| publi...
3,/organization/in-touch-network,(in)touch network,http://www.InTouchNetwork.com,|electronics|guides|coffee|restaurants|music|i...,electronics,1500000,operating,gbr,,london,...,0,0,0,0,0,0,0,1,0,(in)touch network |electronics|guides|coffee|r...
4,/organization/r-ranch-and-mine,-r- ranch and mine,,|tourism|entertainment|games|,tourism,60000,operating,usa,tx,dallas,...,0,0,0,0,0,0,0,0,1,-r- ranch and mine |tourism|entertainment|game...
