# Libraries

In [14]:
#!pip install missingno

In [60]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Main libraries
import pandas as pd
import numpy as np

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
% matplotlib inline

from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Loading dataset...

In [16]:
# Location of the training CSV, kept in one place so it is easy to change.
DATA_PATH = "/content/california_housing_train.csv"

# Load the dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 10 rows of the dataset.
df.head(10)

# EDA

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for every column, as a first check for outliers.
df.describe()

In [None]:
# Horizontal boxplots of every column except the target, drawn on one shared axis.
feature_frame = df.drop(columns=['median_house_value'])
fig, ax = plt.subplots(figsize=(20, 12))
sns.boxplot(data=feature_frame, orient="h", palette="Set2", ax=ax)
plt.show()
# So many outliers... or are they?

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Joint plot of median income against the target (median house value).
income_vs_value = sns.jointplot(
    data=df,
    x='median_income',
    y='median_house_value',
)
plt.show()

# Combinations of features

In [38]:
# Candidate predictors: the name of every column except the regression target.
columns_list = df.drop(columns=['median_house_value']).columns.tolist()

# Enumerate every non-empty subset of the predictors, smallest subsets first.
# The upper bound is derived from the number of predictors so the full feature
# set is always included; a hard-coded bound silently skips it when the file
# has more columns than expected.
interactions = [
    subset
    for size in range(1, len(columns_list) + 1)
    for subset in combinations(columns_list, size)
]

In [None]:
# Display the complete list of candidate feature subsets.
interactions

# Linear Regression (with feature selection)

In [49]:
# Exhaustive feature selection: fit one linear regression per candidate feature
# subset and record its mean squared error on the held-out rows.
# NOTE(review): the best subset is picked using this same held-out error, so the
# winning score is an optimistic estimate -- consider a separate validation split.
Y = df['median_house_value']

# The split depends only on the number of rows and random_state, so it is the
# same for every subset; compute it once and select columns inside the loop.
train_df, test_df, y_train, y_test = train_test_split(df, Y, train_size=0.8, random_state=0)

# mse_scores[i] is the test MSE of interactions[i].
mse_scores = list()
for combination in interactions:
    features = list(combination)
    model = LinearRegression().fit(train_df[features], y_train)
    y_pred = model.predict(test_df[features])
    mse_scores.append(mean_squared_error(y_test, y_pred))

In [55]:
# Report the lowest test MSE and the feature subset that produced it.
best_mse = min(mse_scores)
best_features = interactions[mse_scores.index(best_mse)]

print("The minimum mse is: ", best_mse)
print("With features: ")
print(best_features)

The minimum mse is:  4739460612.860681
With features: 
('longitude', 'latitude', 'housing_median_age', 'total_bedrooms', 'population', 'households', 'median_income')


## Same, but with normalized data

In [68]:
# Same exhaustive search as the unnormalized run, but every feature subset is
# rescaled with MinMaxScaler before the linear regression is fitted.
Y = df['median_house_value']

# The split depends only on the number of rows and random_state, so it is the
# same for every subset; compute it once and select columns inside the loop.
train_df, test_df, y_train, y_test = train_test_split(df, Y, train_size=0.80, random_state=0)

# mse_scores[i] is the test MSE of interactions[i].
mse_scores = list()
for combination in interactions:
    features = list(combination)
    # Fit the scaler on the training rows only, then apply it to both splits,
    # so no information from the test rows reaches the model.
    scaler = MinMaxScaler().fit(train_df[features])
    x_train = scaler.transform(train_df[features])
    x_test = scaler.transform(test_df[features])
    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse_scores.append(mean_squared_error(y_test, y_pred))

In [69]:
# Report the lowest test MSE of the normalized run and the subset that produced it.
best_mse = min(mse_scores)
best_features = interactions[mse_scores.index(best_mse)]

print("The minimum mse is: ", best_mse)
print("With features: ")
print(best_features)

The minimum mse is:  4739460612.860679
With features: 
('longitude', 'latitude', 'housing_median_age', 'total_bedrooms', 'population', 'households', 'median_income')


# **Conclusions**

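As we can see, combining linear regression with exhaustive feature selection lets us pick the feature combination that gives the lowest mean squared error among all the combinations tried. Normalizing the data with MinMaxScaler makes no difference: a least-squares linear model with an intercept produces the same predictions after each feature is linearly rescaled, so the two minimum MSE values agree up to floating-point rounding.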

The feature most strongly correlated with the target is median_income (correlation ≈ 0.69).

We cannot conclude from the boxplots above that the data contains outliers.