<a href="https://colab.research.google.com/github/sutanto5/DataScience-Exploration/blob/main/JS_MLScope_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.datasets import fetch_california_housing

In [11]:
# Load the California housing dataset from scikit-learn; as_frame=True asks
# for pandas objects rather than NumPy arrays.
california = fetch_california_housing(as_frame=True)

# .frame is a single DataFrame holding the feature columns together with the
# target column (MedHouseVal).
df = california.frame

In [17]:
df.head(1000) # preview the first 1000 rows; the rendered table elides the middle rows

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
995,4.8624,11.0,5.680000,1.044706,5826.0,2.741647,37.71,-121.75,1.924
996,9.1531,25.0,5.811765,0.952941,254.0,2.988235,37.74,-121.77,4.188
997,4.7361,22.0,6.080220,1.036264,2474.0,2.718681,37.70,-121.80,2.168
998,5.4324,17.0,5.975831,0.965257,2222.0,3.356495,37.69,-121.80,2.155


In [30]:
# A notebook cell only echoes its last expression, so the column index has to
# be printed explicitly to show up alongside the summary table.
print(df.columns)

# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [None]:
# Example of removing columns from the dataset. With inplace=False, drop()
# returns a separate copy and leaves df untouched; inplace=True would modify
# df directly instead.
test = df.drop(
    columns=["Latitude", "Longitude", "AveOccup"],
    inplace=False,
)
test.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,3.422


# **Data Cleaning**

In [13]:
# Count missing values per column. Printed explicitly because a cell only
# echoes its last expression. Every count is 0, so there are no null values.
print(df.isnull().sum())

# dropna() returns a new DataFrame instead of modifying df, so the result has
# to be assigned to take effect. how='all' only removes rows in which every
# column is NaN; with no nulls present, nothing is dropped.
df = df.dropna(how='all')

df.shape  # size of the dataset as (rows, columns): 20640 entries, 9 columns

(20640, 9)

In [None]:
# Visual check for missing values: each cell of the heatmap is one entry of
# df.isnull(). The row-label option is `yticklabels`; an unknown keyword such
# as `ytick` is forwarded to matplotlib's pcolormesh, which rejects it.
# "auto" lets seaborn thin the labels so the 20640 rows do not overlap.
sns.heatmap(df.isnull(), yticklabels="auto", annot=False);

# **Training Model**

We will be training our model to predict the median house value of an area based on the geographical trends of that area.



In [19]:
# Feature matrix: drop MedHouseVal because it is the value we want to predict,
# and drop Latitude/Longitude because they did not seem to vary that much.
X = df.drop(columns=["Latitude", "Longitude", "MedHouseVal"])

# Start with a linear regression as the first model to test on our data.
reg = LinearRegression()

X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467


In [22]:
# Target: the median house value column that the model should predict.
y = df['MedHouseVal']
# One value per row of df.
y.shape

(20640,)

In [23]:
# Split the data into train and test sets using the common 80/20 ratio.
# random_state is the seed for the shuffle, so fixing it at 11 makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [32]:
# Confirm that the training features and target have the same number of rows.
print(x_train.shape)
print(y_train.shape)

# Only the last expression of a cell is echoed, so the preview of the first
# rows is printed explicitly; otherwise its result is silently discarded.
print(x_train.head())

# Summary statistics of the training features.
x_train.describe()

(16512, 6)
(16512,)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,3.856323,28.732437,5.427024,1.097891,1426.363251,3.091189
std,1.898502,12.609133,2.597385,0.499879,1151.945583,11.463774
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308
25%,2.55435,18.0,4.429182,1.006304,785.0,2.42878
50%,3.5222,29.0,5.21229,1.04878,1165.0,2.819087
75%,4.7308,37.0,6.045098,1.09905,1719.0,3.287706
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333


In [33]:
# Fit the linear regression on the training split.
reg.fit(x_train,y_train)

In [34]:
# Predicted median house values for the held-out test rows.
reg.predict(x_test)

array([1.76096569, 2.44131378, 1.94398369, ..., 1.37963787, 3.1900693 ,
       1.99381092])

In [35]:
# Score the model on the held-out test set to see how well it predicts y.
# For LinearRegression, score() returns the coefficient of determination (R^2),
# not a classification accuracy; here it is about 0.54.
reg.score(x_test,y_test)

0.5435281737013931

In [None]:
# predict using your own data