In [1]:
!pip install scikit-learn
import pandas as pd
import numpy as np
import random




[notice] A new release of pip is available: 24.1.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Random seed for reproducibility
np.random.seed(42)

# Possible values
states = ["Kerala","Haryana","Punjab","Gujarat","Bihar","Tamil Nadu","Uttar Pradesh","Rajasthan","Maharashtra","Karnataka"]
castes = ["General","OBC","SC","ST"]
religions = ["Hindu","Muslim","Christian","Sikh"]
occupations = ["Govt Job","Private Job","Business","Farmer","Unemployed"]
marriage_types = ["Arranged","Love"]
areas = ["Urban","Rural"]
families = ["Joint","Nuclear"]

data = []
n = 15000  # Number of rows

for _ in range(n):
    state = random.choice(states)
    caste = random.choice(castes)
    religion = random.choice(religions)
    groom_age = random.randint(21, 35)
    bride_age = random.randint(18, 30)
    groom_edu = random.randint(10, 20)  # years
    bride_edu = random.randint(8, 20)
    groom_income = random.randint(10000, 150000)  # monthly
    bride_income = random.randint(0, 80000)
    occupation = random.choice(occupations)
    marriage_type = random.choice(marriage_types)
    area = random.choice(areas)
    family_type = random.choice(families)
    
    # Base dowry calculation (synthetic, research-inspired pattern)
    dowry = groom_income * 10  
    if caste == "General": dowry *= 1.3
    elif caste == "OBC": dowry *= 1.1
    elif caste == "SC": dowry *= 0.85
    elif caste == "ST": dowry *= 0.7

    if marriage_type == "Love": dowry *= 0.4
    if occupation == "Govt Job": dowry *= 1.5
    elif occupation == "Business": dowry *= 1.3
    elif occupation == "Farmer": dowry *= 0.9
    elif occupation == "Unemployed": dowry *= 0.6

    if state in ["Kerala","Haryana","Punjab"]:
        dowry *= 1.2

    dowry = int(dowry + np.random.normal(0, 50000))  # add noise
    
    data.append([state,caste,religion,groom_age,bride_age,groom_edu,bride_edu,
                 groom_income,bride_income,occupation,marriage_type,area,family_type,dowry])

# Create DataFrame
columns = ["State","Caste","Religion","Groom_Age","Bride_Age","Groom_Education_Yrs","Bride_Education_Yrs",
           "Groom_Income_Monthly","Bride_Income_Monthly","Occupation","Marriage_Type","Area","Family_Type","Dowry_Amount_INR"]

df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv("indian_dowry_dataset.csv", index=False)

print("Dataset saved as 'indian_dowry_dataset.csv' with", len(df), "rows")


Dataset saved as 'indian_dowry_dataset.csv' with 15000 rows


In [3]:
dowry_data = pd.read_csv("indian_dowry_dataset.csv")
dowry_data[(dowry_data["State"] == "Uttar Pradesh") & (dowry_data["Caste"] == "General")]

Unnamed: 0,State,Caste,Religion,Groom_Age,Bride_Age,Groom_Education_Yrs,Bride_Education_Yrs,Groom_Income_Monthly,Bride_Income_Monthly,Occupation,Marriage_Type,Area,Family_Type,Dowry_Amount_INR
10,Uttar Pradesh,General,Sikh,34,21,17,9,102442,15171,Farmer,Love,Rural,Joint,456257
11,Uttar Pradesh,General,Christian,23,24,15,9,33628,42411,Govt Job,Arranged,Urban,Nuclear,632459
46,Uttar Pradesh,General,Muslim,32,23,19,15,13437,38183,Farmer,Arranged,Rural,Nuclear,134180
138,Uttar Pradesh,General,Hindu,24,27,13,13,94784,34059,Unemployed,Arranged,Rural,Nuclear,779991
173,Uttar Pradesh,General,Christian,28,29,11,16,66607,15064,Farmer,Arranged,Rural,Nuclear,796359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14905,Uttar Pradesh,General,Sikh,21,22,13,11,90570,21180,Unemployed,Arranged,Urban,Nuclear,763093
14917,Uttar Pradesh,General,Muslim,25,23,15,15,127829,51832,Unemployed,Arranged,Rural,Nuclear,1089945
14921,Uttar Pradesh,General,Muslim,23,23,17,14,126748,12309,Unemployed,Arranged,Urban,Nuclear,877296
14930,Uttar Pradesh,General,Hindu,22,26,13,20,72443,27485,Farmer,Arranged,Urban,Nuclear,969382


In [4]:
min_age = dowry_data["Bride_Age"].min()
dowry_data[dowry_data["Bride_Age"] == min_age]

Unnamed: 0,State,Caste,Religion,Groom_Age,Bride_Age,Groom_Education_Yrs,Bride_Education_Yrs,Groom_Income_Monthly,Bride_Income_Monthly,Occupation,Marriage_Type,Area,Family_Type,Dowry_Amount_INR
5,Karnataka,OBC,Sikh,29,18,11,17,146345,28136,Unemployed,Arranged,Urban,Joint,954170
54,Uttar Pradesh,SC,Sikh,27,18,17,17,21958,49054,Unemployed,Love,Urban,Nuclear,96344
55,Bihar,ST,Muslim,24,18,19,12,61372,58552,Unemployed,Arranged,Rural,Joint,304326
64,Uttar Pradesh,SC,Sikh,34,18,13,13,42724,67059,Private Job,Arranged,Rural,Joint,403780
68,Rajasthan,ST,Sikh,33,18,18,19,38960,23007,Govt Job,Love,Urban,Joint,181713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14934,Maharashtra,SC,Sikh,25,18,14,12,97341,36266,Govt Job,Love,Rural,Joint,551647
14960,Punjab,OBC,Sikh,33,18,18,13,43580,37595,Unemployed,Arranged,Urban,Nuclear,376600
14968,Bihar,ST,Christian,31,18,10,15,30653,59084,Farmer,Arranged,Urban,Nuclear,208695
14974,Karnataka,SC,Muslim,29,18,20,17,80730,37468,Business,Love,Urban,Nuclear,409579


In [5]:
max_dowry = dowry_data["Dowry_Amount_INR"].max()
max_dowry

np.int64(3492070)

In [6]:
dowry_data[dowry_data["Dowry_Amount_INR"] == max_dowry]

Unnamed: 0,State,Caste,Religion,Groom_Age,Bride_Age,Groom_Education_Yrs,Bride_Education_Yrs,Groom_Income_Monthly,Bride_Income_Monthly,Occupation,Marriage_Type,Area,Family_Type,Dowry_Amount_INR
14275,Haryana,General,Sikh,35,24,18,17,148141,53710,Govt Job,Arranged,Rural,Nuclear,3492070


In [7]:
# prepare our data
X = dowry_data.drop("Dowry_Amount_INR", axis = 1)
y = dowry_data["Dowry_Amount_INR"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35)

In [8]:
# Setting up estimator
from sklearn.linear_model import Lasso

model = Lasso()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

model.score(X_test, y_test)

ValueError: could not convert string to float: 'Maharashtra'

In [9]:
dowry_data.dtypes

State                   object
Caste                   object
Religion                object
Groom_Age                int64
Bride_Age                int64
Groom_Education_Yrs      int64
Bride_Education_Yrs      int64
Groom_Income_Monthly     int64
Bride_Income_Monthly     int64
Occupation              object
Marriage_Type           object
Area                    object
Family_Type             object
Dowry_Amount_INR         int64
dtype: object

In [10]:
dowry_data["Occupation"].value_counts(), dowry_data["Religion"].value_counts()

(Occupation
 Private Job    3062
 Farmer         3020
 Business       3005
 Govt Job       2991
 Unemployed     2922
 Name: count, dtype: int64,
 Religion
 Hindu        3771
 Sikh         3753
 Christian    3752
 Muslim       3724
 Name: count, dtype: int64)

In [11]:
dowry_data["State"].value_counts()

State
Karnataka        1564
Punjab           1535
Haryana          1532
Gujarat          1526
Uttar Pradesh    1521
Kerala           1499
Rajasthan        1488
Maharashtra      1464
Bihar            1459
Tamil Nadu       1412
Name: count, dtype: int64

In [12]:
dowry_data.isna().sum()

State                   0
Caste                   0
Religion                0
Groom_Age               0
Bride_Age               0
Groom_Education_Yrs     0
Bride_Education_Yrs     0
Groom_Income_Monthly    0
Bride_Income_Monthly    0
Occupation              0
Marriage_Type           0
Area                    0
Family_Type             0
Dowry_Amount_INR        0
dtype: int64

In [13]:
# data that need conversion: Area, Family_Type, Marriage_Type, Occupation, Religion, Caste, State, 

In [47]:
# Turn the categories into numbers as only this is understood by machines

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Area", "Family_Type", "Marriage_Type", "Occupation", "Religion", "Caste", "State"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.40000e+01,
        1.39882e+05, 5.21040e+04],
       [0.00000e+00, 1.00000e+00, 1.00000e+00, ..., 1.00000e+01,
        6.35540e+04, 6.56920e+04],
       [0.00000e+00, 1.00000e+00, 1.00000e+00, ..., 1.40000e+01,
        5.32860e+04, 3.23410e+04],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+01,
        7.09160e+04, 1.25140e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.60000e+01,
        3.62360e+04, 2.28740e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.20000e+01,
        1.21782e+05, 3.33830e+04]])

In [48]:
# Making train_test_split again
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.3)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

model.score(X_test, y_test)

0.9814885754955763

In [16]:
# Let's try another model

In [49]:
from sklearn.ensemble import RandomForestRegressor

model2 = RandomForestRegressor()

model2.fit(X_train, y_train)

y_pred2 = model2.predict(X_test)

model2.score(X_test, y_test)

0.9814469003250434

In [18]:
### This seems fine for now! Let save the model

In [50]:
from joblib import load, dump
# SAVE THE model
dump(model2, filename = ("dowry_predict_model.joblib"))
# Also saving the encoder
dump(transformer, filename = ("./dowry_encoder.joblib"))

['./dowry_encoder.joblib']

In [51]:
# Now we can Load The Data when needed.

In [52]:
dowry_data.columns

Index(['State', 'Caste', 'Religion', 'Groom_Age', 'Bride_Age',
       'Groom_Education_Yrs', 'Bride_Education_Yrs', 'Groom_Income_Monthly',
       'Bride_Income_Monthly', 'Occupation', 'Marriage_Type', 'Area',
       'Family_Type', 'Dowry_Amount_INR'],
      dtype='object')

In [53]:
# Let's take user's i/p and reform it to be passed to the model
transformer = load(filename=("./dowry_encoder.joblib"))
def user_input(State, Caste, Religion, Groom_Age, Bride_Age, Groom_Education_Yrs, Bride_Education_Yrs, Groom_Income_Monthly, Bride_Income_Monthly, Occupation, Marriage_Type, Area, Family_Type):
    user_data = {"State" : State, 
             "Caste" : Caste,
             "Religion" : Religion,
             "Groom_Age" : Groom_Age,
             "Bride_Age" : Bride_Age,
             "Groom_Education_Yrs" : Groom_Education_Yrs,
             "Bride_Education_Yrs" : Bride_Education_Yrs,
             "Groom_Income_Monthly" : Groom_Income_Monthly,
             "Bride_Income_Monthly" : Bride_Income_Monthly,
             "Occupation" : Occupation,
             "Marriage_Type" : Marriage_Type,
             "Area" : Area,
             "Family_Type" : Family_Type
            },
    data = pd.DataFrame(data = user_data)

    # Encoding the necessary features using the dowry_encoder.joblib
    data = transformer.transform(data)
    return data

# Now defining a func to predict user's data on the model
def model_predict(data, model):
    y_pred = model.predict(data)
    return y_pred
    

In [54]:
from joblib import load
model = load(filename=("dowry_predict_model.joblib"))

In [56]:
# Test the model

data = user_input("Bihar", "General", "Hindu", 26, 18, 23, 20, 600000, 200000, "Private Job", "Arranged", "Urban", "Joint")
y_pred = model_predict(data, model)
print(y_pred)

[1898243.27]


In [57]:
### AMAZING!!!!

In [60]:
data = user_input("Uttar Pradesh", "General", "Hindu", 25, 23, 24, 22, 600000, 200000, "Private Job", "Arranged", "Urban", "Joint")
y_pred = model_predict(data, model)
print(y_pred)

[1901698.45]


In [59]:
dowry_data["Occupation"].value_counts()

Occupation
Private Job    3062
Farmer         3020
Business       3005
Govt Job       2991
Unemployed     2922
Name: count, dtype: int64