# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the raw listings.
df = pd.read_csv('Bengaluru_House_Data.csv')

# Drop the columns that are not used further down in this script.
df = df.drop(columns=['area_type', 'balcony', 'availability', 'society'])

# Impute missing values: fixed defaults for the text columns, median for 'bath'.
# The results are assigned back to the column on purpose: calling
# fillna(inplace=True) on df['bath'] operates on a temporary Series and does
# not update df when pandas Copy-on-Write is active.
df['size'] = df['size'].fillna('2 BHK')
df['location'] = df['location'].fillna('Whitefield')
df['bath'] = df['bath'].fillna(df['bath'].median())

# The leading token of 'size' (e.g. the '2' in '2 BHK') becomes an integer column.
df['BHK'] = df['size'].str.split().str.get(0).astype(int)

def convert_range(x):
    """Convert a square-footage entry to a float.

    A value of the form ``'<low> - <high>'`` is replaced by the midpoint of
    the two numbers; any other value is parsed as a plain number.

    Args:
        x: Raw cell value, normally a string such as ``'1200'`` or
            ``'1000 - 1200'``. Non-string values are passed to ``float``.

    Returns:
        The parsed float, or ``None`` when the value cannot be interpreted
        as a number or a numeric range (e.g. ``'34.46Sq. Meter'``).
    """
    if isinstance(x, str):
        parts = x.split('-')
        if len(parts) == 2:
            try:
                return (float(parts[0]) + float(parts[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. '-5' or '1e-3'); fall through and
                # try the whole string as a single number instead.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
    
# Parse every total_sqft entry to a number (unparseable entries become missing).
df['total_sqft'] = df['total_sqft'].apply(convert_range)

# 'price' is multiplied by 100000 before dividing by the area.
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])

# Strip surrounding whitespace so equal location names count together.
df['location'] = [name.strip() for name in df['location']]
location_counts = df['location'].value_counts()

# Keep only the locations that occur 10 times or fewer ...
location_counts = location_counts.loc[location_counts.le(10)]

# ... and fold all of them into a single 'others' bucket.
is_rare = df['location'].isin(location_counts.index)
df['location'] = df['location'].where(~is_rare, 'others')

def remove_outlier_sqft(df):
    """Keep, per location, rows whose price_per_sqft is within one std of the mean.

    For each ``location`` group the mean ``m`` and population standard
    deviation ``st`` (``ddof=0``, what ``np.std`` computes) of
    ``price_per_sqft`` are calculated, and only rows with
    ``m - st < price_per_sqft <= m + st`` are retained.

    Args:
        df: DataFrame with ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame with the retained rows of every location and a fresh
        0..n-1 index. Rows with a missing ``price_per_sqft`` fail both
        comparisons and are therefore dropped. For an input without any
        groups an empty frame with the input's columns is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)
        # NOTE(review): the lower bound is exclusive, so a group whose prices
        # are all identical (st == 0) is removed entirely. Kept as-is to
        # preserve the existing filtering behaviour.
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept:
        # pd.concat rejects an empty list; preserve the schema instead.
        return df.iloc[:0].reset_index(drop=True)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)

# Per location, keep only rows whose price_per_sqft lies within one standard
# deviation of that location's mean; the result replaces df.
df = remove_outlier_sqft(df)

def bhk_outliers_remover(df):
    """Drop rows priced below the mean of the next-smaller BHK in the same location.

    Within each ``location``, rows with ``BHK == k`` are removed when their
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the rows
    with ``BHK == k - 1``, provided that smaller-BHK group has more than
    5 rows.

    Args:
        df: DataFrame with ``location``, ``BHK`` and ``price_per_sqft``
            columns.

    Returns:
        A new DataFrame without the excluded rows; the original index labels
        of the remaining rows are preserved.
    """
    # Collect labels in a plain list: appending to a float NumPy array copies
    # the array on every call and coerces the labels to float64.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }

        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats is not None and stats['count'] > 5:
                cheaper = bhk_df.price_per_sqft < stats['mean']
                exclude_indices.extend(bhk_df.index[cheaper.to_numpy()].tolist())
    return df.drop(index=exclude_indices)

# Remove rows that are cheaper per sqft than the mean of the next-smaller BHK
# in the same location.
df = bhk_outliers_remover(df)

# 'size' and 'price_per_sqft' are not written out or used for modelling.
df = df.drop(columns=['size', 'price_per_sqft'])
df.to_csv('cleaned_data_new.csv')

# Target is 'price'; every remaining column is a feature.
y = df['price']
x = df.drop(columns=['price'])

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# One-hot encode 'location' into a dense array and pass the other columns
# through. The dense-output keyword is 'sparse_output' in newer scikit-learn
# releases and 'sparse' in older ones, so try the new spelling first.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
column_trans = make_column_transformer((encoder, ['location']), remainder='passthrough')
scaler = StandardScaler()
# Default estimator: the original positional argument `n` was an undefined
# name, and feature scaling is already done by the StandardScaler step.
lr = LinearRegression()
pipe = make_pipeline(column_trans, scaler, lr)
pipe.fit(X_train, Y_train)

# R^2 of the linear model on the held-out rows.
Y_pred = pipe.predict(X_test)
print(r2_score(Y_test, Y_pred))

# Lasso with default settings, using the same encoder and scaler steps.
lasso = Lasso()
pipe = make_pipeline(column_trans, scaler, lasso).fit(X_train, Y_train)
Y_pred_lasso = pipe.predict(X_test)
lasso_r2 = r2_score(Y_test, Y_pred_lasso)
print(lasso_r2)

# Ridge with default settings; `pipe` is left bound to this pipeline.
ridge = Ridge()
pipe = make_pipeline(column_trans, scaler, ridge).fit(X_train, Y_train)
Y_pred_ridge = pipe.predict(X_test)
ridge_r2 = r2_score(Y_test, Y_pred_ridge)
print(ridge_r2)

import pickle

# Serialise the pipeline currently bound to `pipe`. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open('RidgeModel_new.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

# Recorded output of the three print(r2_score(...)) calls above, in order:
# 0.7081413910424553
# 0.7037234693750838
# 0.7081554039249696
