In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit 

In [2]:
housing = pd.read_csv("Data/housing.csv")
housing.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [3]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Entfernen der NaNs

In [4]:
housing_woNaN = housing.dropna()

In [5]:
housing_woNaN.reset_index(drop=True, inplace=True)

In [6]:
housing_woNaN.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Aufteilung des Datensatzes in Train, Validate und Test
#### Attribut "median_income" wird verwendet, um Kategorien zu erzeugen. Die Daten werden dann repräsentativ nach der Kategorie aufgeteilt.

In [7]:
housing_woNaN["income_cat"] = np.ceil(housing["median_income"]/1.5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
housing_woNaN["income_cat"].where(housing_woNaN["income_cat"]<5, 5.0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [12]:
stratsplit = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 42)
for train_index, validate_index in stratsplit.split(housing_woNaN, housing_woNaN["income_cat"]):
    strat_train_set = housing_woNaN.loc[train_index]
    strat_test_validate_set = housing_woNaN.loc[validate_index]

In [13]:
strat_test_validate_set.reset_index(drop=True, inplace = True)

In [14]:
stratsplit = StratifiedShuffleSplit(n_splits = 1, test_size = 0.5, random_state = 42)
for validate_index, test_index in stratsplit.split(strat_test_validate_set, strat_test_validate_set["income_cat"]):
    strat_validate_set = strat_test_validate_set.loc[validate_index]
    strat_test_set = strat_test_validate_set.loc[test_index]

#### Überprüfen, ob gleichmäßig/repräsentativ aufgeteilt wurde

In [15]:
strat_train_set["income_cat"].value_counts()/len(strat_train_set)

3.0    0.351744
2.0    0.318395
4.0    0.175697
5.0    0.114591
1.0    0.039572
Name: income_cat, dtype: float64

In [16]:
strat_validate_set["income_cat"].value_counts()/len(strat_validate_set)

3.0    0.351713
2.0    0.318434
4.0    0.175856
5.0    0.114519
1.0    0.039478
Name: income_cat, dtype: float64

In [17]:
strat_test_set["income_cat"].value_counts()/len(strat_test_set)

3.0    0.351713
2.0    0.318434
4.0    0.175530
5.0    0.114519
1.0    0.039804
Name: income_cat, dtype: float64

In [18]:
strat_train_set.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,income_cat
count,14303.0,14303.0,14303.0,14303.0,14303.0,14303.0,14303.0,14303.0,14303.0,14303.0
mean,-119.576913,35.637777,28.623016,2633.192827,537.272111,1420.228064,498.451304,3.86717,206742.01685,3.007341
std,2.004551,2.132878,12.643956,2172.463098,419.398449,1111.598476,377.965408,1.903412,115476.289845,1.053931
min,-124.3,32.54,1.0,2.0,2.0,6.0,2.0,0.4999,14999.0,1.0
25%,-121.805,33.94,18.0,1446.0,295.0,785.0,279.0,2.56655,119400.0,2.0
50%,-118.51,34.26,29.0,2121.0,434.0,1166.0,409.0,3.5364,179800.0,3.0
75%,-118.02,37.72,37.0,3146.0,647.0,1720.0,603.0,4.73275,264550.0,4.0
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001,500001.0,5.0


In [19]:
strat_validate_set.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,income_cat
count,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0
mean,-119.552819,35.608506,28.800326,2633.460033,537.641109,1437.706036,500.641762,3.885063,207075.544535,3.007504
std,2.005584,2.131222,12.45768,2207.371827,434.846277,1231.926008,401.402293,1.881792,115783.477531,1.053842
min,-124.22,32.56,2.0,8.0,1.0,5.0,1.0,0.4999,14999.0,1.0
25%,-121.75,33.92,18.0,1444.0,298.0,796.0,279.0,2.5574,120700.0,2.0
50%,-118.48,34.24,29.0,2116.0,433.0,1165.0,408.0,3.5625,178700.0,3.0
75%,-117.99,37.68,37.0,3114.0,644.0,1716.0,601.0,4.7891,262000.0,4.0
max,-114.47,41.95,52.0,32627.0,6445.0,28566.0,6082.0,15.0001,500001.0,5.0


In [20]:
strat_test_set.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,income_cat
count,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0
mean,-119.559514,35.636679,28.512887,2655.001305,540.892659,1434.208809,502.808483,3.875889,207224.451223,3.006525
std,1.997518,2.158068,12.483223,2222.953966,417.062526,1130.310183,382.934957,1.897996,114933.205895,1.054313
min,-124.35,32.56,2.0,12.0,4.0,3.0,2.0,0.4999,22500.0,1.0
25%,-121.78,33.93,18.0,1467.0,300.0,793.0,284.0,2.5556,119000.0,2.0
50%,-118.47,34.24,29.0,2172.0,439.0,1168.0,415.0,3.5192,181300.0,3.0
75%,-118.0,37.72,37.0,3151.0,649.0,1747.0,610.0,4.7813,267400.0,4.0
max,-114.57,41.81,52.0,37937.0,5471.0,16122.0,5189.0,15.0001,500001.0,5.0


### Factorizen und OneHotEncoden von kategorischen Merkmalen

In [26]:
train_cat = strat_train_set["ocean_proximity"]
validate_cat = strat_validate_set["ocean_proximity"]
test_cat = strat_test_set["ocean_proximity"]

In [30]:
type(train_cat)

pandas.core.series.Series

In [27]:
# Definition einer Funktion, um kategorische Merkmale zu factorizen
def factorize(df, column):
    df = df[column]
    df_factorized, df_categories = df.factorize()
    return df_factorized, df_categories

In [32]:
factorize(strat_train_set, "ocean_proximity")

(array([0, 0, 0, ..., 0, 1, 2], dtype=int64),
 Index(['INLAND', '<1H OCEAN', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'], dtype='object'))