In [123]:
from sklearn.svm import SVR
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [124]:
# Load the raw training data; the file is expected next to the notebook.
train_csv_path = 'Train.csv'
dataset = pd.read_csv(train_csv_path)

In [125]:
# Encode the categorical listing attributes as integer codes.
# The result is assigned back to the column on purpose: calling
# `replace(..., inplace=True)` on `dataset[col]` is chained in-place mutation,
# which may operate on a temporary object and leave `dataset` untouched
# (this is always the case under pandas Copy-on-Write).
dataset['BHK_OR_RK'] = dataset['BHK_OR_RK'].replace({'BHK': 0, 'RK': 1})
dataset['POSTED_BY'] = dataset['POSTED_BY'].replace({'Dealer': 0, 'Owner': 1, 'Builder': 2})

In [126]:
# Discard the coordinate columns and the BHK/RK flag; they are not used downstream.
unused_columns = ['LONGITUDE', 'LATITUDE', 'BHK_OR_RK']
dataset.drop(columns=unused_columns, inplace=True)

In [127]:
# Sanity check: (rows, columns) of the full dataset after the column drop.
dataset.shape

(29451, 9)

In [128]:
# Build a balanced training subset: N_ADDRESSES addresses x ROWS_PER_ADDRESS rows each.
# NOTE: this cell removes the sampled rows from `dataset`, so it is not idempotent;
# re-run the notebook from the top rather than re-running this cell on its own.
N_ADDRESSES = 40        # how many addresses to keep
ROWS_PER_ADDRESS = 50   # how many rows to draw per address
SEED = 42               # fixed seed so the selection is reproducible

# Step 1: Find the addresses that occur more than 50 times, so each of them can
# supply ROWS_PER_ADDRESS rows when sampling without replacement.
address_counts = dataset['ADDRESS'].value_counts()
addresses_above_50 = address_counts[address_counts > 50].index

# Step 2: Randomly pick N_ADDRESSES of those addresses.
rel_address = addresses_above_50.to_series().sample(n=N_ADDRESSES, random_state=SEED).tolist()

# Step 3: Draw ROWS_PER_ADDRESS rows for every selected address.
# The per-address samples are collected in a list and concatenated once,
# instead of growing a DataFrame inside the loop.
sampled_parts = [
    dataset[dataset['ADDRESS'] == address].sample(n=ROWS_PER_ADDRESS, random_state=SEED)
    for address in rel_address
]
sampled_rows = pd.concat(sampled_parts)

# Step 4: Remove the sampled rows from `dataset` (so later cells can draw a
# disjoint test sample from what is left) and give the training subset a clean index.
dataset = dataset.drop(sampled_rows.index)
final_dataset = sampled_rows.reset_index(drop=True)

assert len(final_dataset) == N_ADDRESSES * ROWS_PER_ADDRESS, "unexpected training subset size"


In [129]:
# Rows left in `dataset` after the sampled training rows were removed.
dataset.shape

(27451, 9)

In [130]:
# Preview the first rows of the sampled training subset.
final_dataset.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
0,1,0,0,2,1400.037334,1,1,"Sector-77 Noida,Noida",75.0
1,0,0,0,2,1000.0,1,1,"Sector-77 Noida,Noida",57.0
2,0,0,0,4,2104.581512,1,1,"Sector-77 Noida,Noida",130.0
3,0,0,1,3,2200.0,1,1,"Sector-77 Noida,Noida",110.0
4,1,0,0,3,1870.207593,1,1,"Sector-77 Noida,Noida",100.0


In [131]:
# Expect 40 addresses x 50 rows = 2000 rows.
final_dataset.shape

(2000, 9)

In [132]:
# Step 1: Filter the test_dataset to include only rows with addresses in addresses_above_40
# Build the hold-out sample from the rows still left in `dataset`, restricted to
# the addresses chosen for training (`rel_address`) so every test address is one
# the model has seen.
is_selected_address = dataset['ADDRESS'].isin(rel_address)
filtered_test_df = dataset[is_selected_address]

# Draw 100 of those rows with a fixed seed and renumber them from 0.
test_sample = (
    filtered_test_df
    .sample(n=100, random_state=42)
    .reset_index(drop=True)
)

In [133]:
# Confirm every selected address contributes the same number of training rows.
final_dataset['ADDRESS'].value_counts()

ADDRESS
Sector-77 Noida,Noida                 50
Ghodbunder Road,Mumbai                50
New Town,Kolkata                      50
Gomti Nagar Extension,Lucknow         50
Kundli,Sonipat                        50
Malad (West),Lalitpur                 50
Rajaji Nagar,Bangalore                50
Kolshet Road,Mumbai                   50
Akshaya Nagar,Bangalore               50
Sector 82 Faridabad,Faridabad         50
Palanpur,Surat                        50
Thane West,Lalitpur                   50
Sector-78 Noida,Noida                 50
NIBM,Pune                             50
Wakad,Pune                            50
Sector-74 Noida,Noida                 50
Mansarovar Extension,Jaipur           50
Sector-75 Noida,Noida                 50
Vaibhav Khand,Ghaziabad               50
Alwar Bypass Road,Bhiwadi             50
Rajarhat,Kolkata                      50
Noida Extension,Noida                 50
Sector-168 Noida,Noida                50
Zirakpur,Chandigarh                   50
Kharghar

In [134]:
# Expect the 100 rows drawn for the hold-out sample.
test_sample.shape

(100, 9)

In [135]:
# Display the hold-out sample (the rendered output is truncated for long frames).
test_sample

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,TARGET(PRICE_IN_LACS)
0,0,0,0,5,4612.850082,1,1,"Sector-137 Noida,Noida",140.0
1,0,0,0,3,1522.007404,1,1,"New Town,Kolkata",74.0
2,0,1,1,2,1120.259481,0,1,"Zirakpur,Chandigarh",44.9
3,0,0,0,3,1688.596491,1,1,"Zirakpur,Chandigarh",53.9
4,1,0,0,2,831.024931,1,1,"New Town,Kolkata",45.0
...,...,...,...,...,...,...,...,...,...
95,0,0,1,2,1305.158484,1,1,"Zirakpur,Chandigarh",42.0
96,0,1,1,3,1021.461998,0,0,"Zirakpur,Chandigarh",63.3
97,1,0,1,3,1501.812532,1,1,"NIBM,Pune",87.0
98,0,0,1,1,610.066090,1,1,"Zirakpur,Chandigarh",24.0


In [136]:
from sklearn.preprocessing import LabelEncoder

In [137]:
# Turn the ADDRESS strings into integer codes.
# The encoder is fitted on the training subset only; the test sample was drawn
# from the same set of addresses, so every test label is known to the encoder.
label_encoder = LabelEncoder()
final_dataset['encoded_address'] = label_encoder.fit_transform(final_dataset['ADDRESS'])
test_sample['encoded_address'] = label_encoder.transform(test_sample['ADDRESS'])

# The raw text column is no longer needed once the codes exist.
for frame in (final_dataset, test_sample):
    frame.drop(columns='ADDRESS', inplace=True)

In [138]:
# Preview the training subset with ADDRESS replaced by `encoded_address`.
final_dataset.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,TARGET(PRICE_IN_LACS),encoded_address
0,1,0,0,2,1400.037334,1,1,75.0,33
1,0,0,0,2,1000.0,1,1,57.0,33
2,0,0,0,4,2104.581512,1,1,130.0,33
3,0,0,1,3,2200.0,1,1,110.0,33
4,1,0,0,3,1870.207593,1,1,100.0,33


In [139]:
# Preview the test sample with ADDRESS replaced by `encoded_address`.
test_sample.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,TARGET(PRICE_IN_LACS),encoded_address
0,0,0,0,5,4612.850082,1,1,140.0,29
1,0,0,0,3,1522.007404,1,1,74.0,21
2,0,1,1,2,1120.259481,0,1,44.9,39
3,0,0,0,3,1688.596491,1,1,53.9,39
4,1,0,0,2,831.024931,1,1,45.0,21


In [140]:
# Persist both splits as CSV files without the pandas index column.
csv_exports = (
    (final_dataset, 'final_dataset.csv'),
    (test_sample, 'test_sample.csv'),
)
for frame, out_path in csv_exports:
    frame.to_csv(out_path, index=False)