## Importing libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

## Reading files through pandas

In [None]:
# Load the training set, the test set and the sample submission (relative paths).
housing_df1, housing_df2, housing_df3 = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Different methods to inspect the data

In [None]:
# A notebook cell renders only its final expression, so every summary is printed
# explicitly; as bare expressions all but the duplicate count were discarded.
train_summaries = {
    'head': housing_df1.head(),
    'tail': housing_df1.tail(),
    'notnull': housing_df1.notnull(),
    'nunique': housing_df1.nunique(),
    'shape': housing_df1.shape,
    'columns': housing_df1.columns,
    'null counts': housing_df1.isnull().sum(),
    'describe': housing_df1.describe(),
    'duplicated rows': housing_df1.duplicated().sum(),
}
for title, summary in train_summaries.items():
    print(f'--- {title} ---')
    print(summary)

In [None]:
# A notebook cell renders only its final expression, so every summary is printed
# explicitly; as bare expressions all but the duplicate count were discarded.
test_summaries = {
    'head': housing_df2.head(),
    'tail': housing_df2.tail(),
    'notnull': housing_df2.notnull(),
    'nunique': housing_df2.nunique(),
    'shape': housing_df2.shape,
    'columns': housing_df2.columns,
    'null counts': housing_df2.isnull().sum(),
    'describe': housing_df2.describe(),
    'duplicated rows': housing_df2.duplicated().sum(),
}
for title, summary in test_summaries.items():
    print(f'--- {title} ---')
    print(summary)

In [None]:
# Print the info summary of a single column (here POSTED_BY).
housing_df1.loc[:, 'POSTED_BY'].info()

## Dropping the unnecessary columns

In [None]:
housing_df1 = housing_df1.drop('ADDRESS', axis=1)
housing_df1 = housing_df1.drop ('POSTED_BY', axis=1)       
'''To drop any column which is not of use in the tranning or evaluation of the model'''


In [None]:
housing_df2 = housing_df2.drop('ADDRESS', axis=1)
housing_df2 = housing_df2.drop('POSTED_BY', axis=1)

## Spliting the CITY column from  ADDRESS column

In [None]:
housing_df1['CITY'] = housing_df1['ADDRESS'].str.split(',').str[-1]
housing_df2['CITY'] = housing_df2['POSTED_BY'].str.split(',').str[-1]

In [None]:
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

## Plotting graphs 

In [None]:
# One box plot per numeric column. Text columns are skipped because a box plot
# needs numeric values, and the series is passed as the keyword `x` because
# current seaborn releases reject a positional series combined with `data=`.
box_color = sns.color_palette('hls', 1)[0]
for column in housing_df1.select_dtypes(include='number').columns:
    plt.figure(figsize=(14, 6))
    sns.boxplot(x=housing_df1[column], color=box_color)
    plt.xticks(rotation=90)
    plt.yticks(rotation=90)
    plt.show()

## Finding correlations

In [None]:
# Correlation is only defined for numeric data. The frame still holds text
# columns (e.g. CITY), which make DataFrame.corr() raise on pandas >= 2.0, so
# the numeric/boolean columns are selected explicitly first.
corr1 = housing_df1.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of every column with the target price, highest value first.
corr2 = corr1.loc[:, 'TARGET(PRICE_IN_LACS)'].sort_values(ascending=False)
corr2

In [None]:
# Draw the correlation matrix as an annotated heatmap on a 14x6 inch figure.
heat_fig, heat_ax = plt.subplots(figsize=(14, 6))
sns.heatmap(corr1, annot=True, ax=heat_ax)
plt.show()

## Selecting the feature and target columns of the DataFrame

In [None]:
# Feature matrix for training, as a NumPy array with the columns in this order.
X = housing_df1[[
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_NO.',
    'BHK_OR_RK',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
    'LONGITUDE',
    'LATITUDE',
    'CITY',
]].to_numpy()
X

In [None]:
# Select the target by name rather than by position: `iloc[:, -2]` only points
# at the price while the column layout stays exactly as it is now, and would
# silently pick a different column as soon as one is added or removed.
Y = housing_df1['TARGET(PRICE_IN_LACS)'].values
Y

In [None]:
# Feature matrix of the test data, with the same columns and order as X.
test = housing_df2[[
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_NO.',
    'BHK_OR_RK',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
    'LONGITUDE',
    'LATITUDE',
    'CITY',
]].to_numpy()
test


## Encode Column CITY

In [None]:
from sklearn.preprocessing import LabelEncoder

# Column 9 of X holds CITY; replace every city name with its integer label.
le = LabelEncoder()
le.fit(X[:, 9])
X[:, 9] = le.transform(X[:, 9])
X

## Encode column BHK_NO:

In [None]:
# Column 2 holds BHK_NO. The training column is label-encoded, so the test
# column must go through the very same mapping; otherwise the model is trained
# on codes but asked to predict on raw values.
# The values are taken from the source frames so re-running the cell is harmless.
le = LabelEncoder()
X[:, 2] = le.fit_transform(housing_df1['BHK_NO.'])
bhk_no_codes = {value: code for code, value in enumerate(le.classes_)}
# LabelEncoder.transform raises on unseen labels, so values that occur only in
# the test data are mapped to -1 instead.
test[:, 2] = [bhk_no_codes.get(value, -1) for value in housing_df2['BHK_NO.']]
X

## Encode column BHK_OR_RK:

In [None]:
# Column 3 of X holds BHK_OR_RK; replace every category with its integer label.
le = LabelEncoder()
le.fit(X[:, 3])
X[:, 3] = le.transform(X[:, 3])
X

## Encoding CITY and BHK_OR_RK for the test data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit on the training cities so that test rows get the same integer codes the
# model is trained on; an encoder refit on the test data would assign unrelated
# codes to the same city.
le = LabelEncoder()
le.fit(housing_df1['CITY'])
city_codes = {city: code for code, city in enumerate(le.classes_)}
# LabelEncoder.transform raises on unseen labels, so cities that occur only in
# the test data are mapped to -1 instead. Reading from housing_df2 (same row
# order as `test`) keeps the cell safe to re-run.
test[:, 9] = [city_codes.get(city, -1) for city in housing_df2['CITY']]
test

In [None]:
# Use an encoder fitted on the training categories (not a refit on the test
# data, and not whatever `le` was left over from a previous cell) so that
# column 3 carries the same codes in both data sets.
le = LabelEncoder()
le.fit(housing_df1['BHK_OR_RK'])
bhk_or_rk_codes = {kind: code for code, kind in enumerate(le.classes_)}
# Categories that occur only in the test data are mapped to -1.
test[:, 3] = [bhk_or_rk_codes.get(kind, -1) for kind in housing_df2['BHK_OR_RK']]
test

## Split train and test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 9 trees and a fixed seed, fitted on the training split.
regressor = RandomForestRegressor(
    random_state=42,
    n_estimators=9,
)
regressor.fit(X_train, y_train)

## Predict on the validation split

In [None]:
# Predict the held-out rows and print predictions next to the true values
# (first column: prediction, second column: actual), rounded to two decimals.
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((Y_pred, y_test)))

## Evaluate the model

In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score

# Root mean squared log error, root mean squared error and R^2 on the held-out rows.
rmsle = np.sqrt(mean_squared_log_error(y_test, Y_pred))
print(rmsle)
rmse = np.sqrt(mean_squared_error(y_test, Y_pred))
print(rmse)
r2 = r2_score(y_test, Y_pred)
print(r2)


## Predict on the test data

In [None]:
# Predictions of the fitted model for the encoded test matrix.
y_result = regressor.predict(X=test)
y_result

## Output predictions file 


In [None]:
# Write the predictions as a single-column CSV without the index.
# NOTE(review): this is the same relative path the sample submission is read
# from at the top of the notebook, so that file is overwritten - confirm intended.
submission = pd.DataFrame({'TARGET(PRICE_IN_LACS)': y_result})
submission.to_csv('sample_submission.csv', index=False)