# Laboratorium nr 4

## Wczytywanie danych

In [1]:
import csv
import pandas as pd


def read_csv_file_data(csv_file):
    lines = []

    with open(csv_file, newline='') as file:
        reader = csv.reader(file, delimiter=',')
        header = next(reader)

        for row in reader:
            lines.append(row)

    return pd.DataFrame(data=lines, columns=header)


CSV_FILE_NAME = './../../data/RainAustralia/weatherAUS.csv'
df_ra = read_csv_file_data(CSV_FILE_NAME)
print(df_ra.shape)

(142193, 24)


In [2]:
del CSV_FILE_NAME

## Usuwanie kolumn

In [3]:
def count_missing_values(df, missing_value='NA'):
    header = tuple(df.columns)
    counts = []

    for i in range(0, len(header)):
        column_name = header[i]
        selection = df[df[column_name] == missing_value]
        counts.append(selection.shape[0])

    return counts


def print_missing_values_counts(df, missing):
    header = list(df.columns)

    for i in range(0, len(header)):
        print('{h:23}{v}'.format(h=header[i], v=missing[i]))


missing_counts = count_missing_values(df_ra)
print_missing_values_counts(df_ra, missing_counts)

Date                   0
Location               0
MinTemp                637
MaxTemp                322
Rainfall               1406
Evaporation            60843
Sunshine               67816
WindGustDir            9330
WindGustSpeed          9270
WindDir9am             10013
WindDir3pm             3778
WindSpeed9am           1348
WindSpeed3pm           2630
Humidity9am            1774
Humidity3pm            3610
Pressure9am            14014
Pressure3pm            13981
Cloud9am               53657
Cloud3pm               57094
Temp9am                904
Temp3pm                2726
RainToday              1406
RISK_MM                0
RainTomorrow           0


In [4]:
def count_missing_percent(missing, total):
    percents = []

    for i in range(0, len(missing)):
        percentage = float((missing[i] / total) * 100)
        percents.append(percentage)

    return percents


missing_percent = count_missing_percent(missing_counts, df_ra.shape[0])
print_missing_values_counts(df_ra, missing_percent)

Date                   0.0
Location               0.0
MinTemp                0.44798267143952236
MaxTemp                0.2264527789694289
Rainfall               0.9887969168665124
Evaporation            42.78902618272348
Sunshine               47.692924405561456
WindGustDir            6.561504434114197
WindGustSpeed          6.5193082641198945
WindDir9am             7.041837502549352
WindDir3pm             2.6569521706413113
WindSpeed9am           0.948007285872019
WindSpeed3pm           1.8495987847503041
Humidity9am            1.247600092831574
Humidity3pm            2.5388028946572616
Pressure9am            9.85561877166949
Pressure3pm            9.832410878172624
Cloud9am               37.73533155640573
Cloud3pm               40.15246882757942
Temp9am                0.635755627914173
Temp3pm                1.9171126567411898
RainToday              0.9887969168665124
RISK_MM                0.0
RainTomorrow           0.0


In [5]:
def list_of_labels_to_be_deleted(df, percents, greater_than=30):
    header = tuple(df.columns)
    labels = []

    for i in range(0, len(percents)):
        if percents[i] > greater_than:
            labels.append(header[i])

    return labels


invalid_columns = list_of_labels_to_be_deleted(df_ra, missing_percent)
invalid_columns.append('RISK_MM')
print(invalid_columns)

['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'RISK_MM']


In [6]:
def delete_invalid_columns(df, invalids):
    df.drop(invalids, axis=1, inplace=True)


delete_invalid_columns(df_ra, invalid_columns)
print(df_ra.shape)

(142193, 19)


In [7]:
del invalid_columns
del missing_counts
del missing_percent

## Imputacja danych

In [8]:
columns_to_skip = ['Date', 'Location']
columns_categorical_str = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
columns_categorical_bool = ['RainToday', 'RainTomorrow']
columns_numerical_float = ['MinTemp', 'MaxTemp', 'Rainfall', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
columns_numerical_int = ['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm']

In [9]:
def impute_numerical_values(df, numerical, data_type, missing_value='NA'):
    dt_num = []

    for i in range(0, len(numerical)):
        header = numerical[i]
        column = df[df[header] != missing_value][header].astype(data_type)
        median = column.median()
        column = df[header].replace(missing_value, median).astype(data_type)
        dt_num.append(column)

    return dt_num


dt_int = impute_numerical_values(df_ra, columns_numerical_int, int)
dt_float = impute_numerical_values(df_ra, columns_numerical_float, float)
print(len(dt_int))
print(len(dt_float))

5
7


In [10]:
import scipy.stats as scs


def replace_bool_as_int(_column, _data_type):
    if _data_type == int:
        _column = _column.replace('No', 0)
        _column = _column.replace('Yes', 1)

    return _column


def impute_categorical_values(df, categorical, data_type, missing_value='NA'):
    _dt_cat = []

    for i in range(0, len(categorical)):
        header = categorical[i]
        column = df[df[header] != missing_value][header]
        column = replace_bool_as_int(column, data_type)
        column = column.astype(data_type)
        dominant = scs.mode(column)[0][0]
        column = df[header].replace(missing_value, dominant)
        column = replace_bool_as_int(column, data_type)
        column = column.astype(data_type)
        _dt_cat.append(column)

    return _dt_cat


dt_cat_int = impute_categorical_values(df_ra, columns_categorical_bool, int)
dt_cat_str = impute_categorical_values(df_ra, columns_categorical_str, str)
print(len(dt_cat_int))
print(len(dt_cat_str))

2
3


In [11]:
def add_formed_columns(df, column_names, column_series):
    for i in range(0, len(column_names)):
        df[column_names[i]] = column_series[i]


def shape_new_data_frame(df):
    headers = set(list(df.columns))
    to_delete = headers.difference(columns_to_skip)
    df.drop(to_delete, axis=1, inplace=True)
    add_formed_columns(df, columns_categorical_str, dt_cat_str)
    add_formed_columns(df, columns_categorical_bool, dt_cat_int)
    add_formed_columns(df, columns_numerical_int, dt_int)
    add_formed_columns(df, columns_numerical_float, dt_float)


shape_new_data_frame(df_ra)
print(df_ra.dtypes)

Date              object
Location          object
WindGustDir       object
WindDir9am        object
WindDir3pm        object
RainToday          int64
RainTomorrow       int64
WindGustSpeed      int64
WindSpeed9am       int64
WindSpeed3pm       int64
Humidity9am        int64
Humidity3pm        int64
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Pressure9am      float64
Pressure3pm      float64
Temp9am          float64
Temp3pm          float64
dtype: object


In [12]:
numerical_columns = columns_numerical_int
numerical_columns.extend(columns_numerical_float)
categorical_columns = columns_categorical_str
categorical_columns.extend(columns_categorical_bool)
categorical_columns.append('Location')
print(len(numerical_columns))
print(len(categorical_columns))

12
6


In [13]:
del columns_to_skip
del columns_categorical_bool
del columns_categorical_str
del columns_numerical_int
del columns_numerical_float
del dt_cat_str
del dt_cat_int
del dt_float
del dt_int

## Obserwacje odstające

In [14]:
import numpy as np


def cut_outliers(df, numerical):
    altered = 0

    for i in range(0, len(numerical)):
        header = numerical[i]
        column_series = df[header]
        q3, q1 = np.percentile(column_series, [75, 25])
        iqr = q3 - q1
        interval = 1.5 * iqr
        r_outlier = q3 + interval
        l_outlier = q1 - interval

        for index, value in column_series.items():
            if value < l_outlier:
                df.loc[index, header] = l_outlier
                altered += 1

            if value > r_outlier:
                df.loc[index, header] = r_outlier
                altered += 1

    return altered

outliers_no = cut_outliers(df_ra, numerical_columns)
print(outliers_no)

46213


In [15]:
del outliers_no

## Normalizacja i kodowanie danych

In [16]:
def normalize_numerical(df, numerical):
    _df = df.copy()

    for i in range(0, len(numerical)):
        header = numerical[i]
        _df[header] = _df[header] / _df[header].max()

    return _df


df_normed = normalize_numerical(df_ra, numerical_columns)
print(df_normed.shape)

(142193, 19)


In [17]:
def encode_categorical(df, categorical):
    _df = df.copy()

    for i in range(0, len(categorical)):
        header = categorical[i]
        new_columns = pd.get_dummies(_df[header], prefix=header)
        _df = pd.concat((_df, new_columns), axis=1)

        if header != 'Location':
            _df = _df.drop(header, axis=1)

    return _df


df_new = encode_categorical(df_normed, categorical_columns)
print(df_new.shape)

(142193, 115)


In [18]:
import datetime


def convert_date_str_to_float(df):
    dates = []

    for index, value in df['Date'].items():
        date = datetime.datetime.fromisoformat(value)
        time = date.timestamp()
        dates.append(time)

    dates = pd.Series(dates).astype(int)
    df['Date'] = dates


convert_date_str_to_float(df_new)
print(df_new.shape)

(142193, 115)


In [19]:
df_new.head()

Unnamed: 0,Date,Location,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,MinTemp,MaxTemp,Rainfall,...,Location_Townsville,Location_Tuggeranong,Location_Uluru,Location_WaggaWagga,Location_Walpole,Location_Watsonia,Location_Williamtown,Location_Witchcliffe,Location_Wollongong,Location_Woomera
0,1228086000,Albury,0.642336,0.540541,0.592593,0.71,0.22,0.437908,0.524628,0.4,...,0,0,0,0,0,0,0,0,0,0
1,1228172400,Albury,0.642336,0.108108,0.54321,0.44,0.25,0.24183,0.575029,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1228258800,Albury,0.671533,0.513514,0.641975,0.38,0.3,0.421569,0.588774,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1228345200,Albury,0.350365,0.297297,0.222222,0.45,0.16,0.300654,0.641466,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1228431600,Albury,0.59854,0.189189,0.493827,0.82,0.33,0.571895,0.739977,0.666667,...,0,0,0,0,0,0,0,0,0,0


In [19]:
del categorical_columns
del numerical_columns
del df_ra
del df_normed

## Podział danych

In [20]:
import sklearn.model_selection as skm


def build_and_regionalize_data(df):
    columns = list(df.columns)
    columns.remove('Location')
    locations = tuple(set(df['Location']))
    target_columns = list(set(df.columns))
    target_columns = [x for x in target_columns if str(x).startswith('RainTomorrow')]
    regions_data = []

    for i in range(0, len(locations)):
        city = locations[i]
        selection = df[df['Location'] == city][columns]
        target = selection[target_columns]
        selection = selection.drop(target_columns, axis=1)
        xtr, xt, ytr, yt = skm.train_test_split(selection, target, stratify=target)
        regions_data.append([xtr, xt, ytr, yt])

    return regions_data, locations


data_regions, region_names = build_and_regionalize_data(df_new)
print(len(region_names))
print(region_names)

49
('BadgerysCreek', 'Adelaide', 'Richmond', 'NorfolkIsland', 'Cobar', 'Moree', 'Albury', 'Dartmoor', 'Katherine', 'Wollongong', 'AliceSprings', 'SalmonGums', 'Uluru', 'SydneyAirport', 'Cairns', 'PearceRAAF', 'Portland', 'GoldCoast', 'Sydney', 'Nuriootpa', 'Woomera', 'Perth', 'Sale', 'Mildura', 'MountGambier', 'Hobart', 'Nhil', 'Walpole', 'PerthAirport', 'Darwin', 'Canberra', 'CoffsHarbour', 'MountGinini', 'NorahHead', 'Tuggeranong', 'Witchcliffe', 'Penrith', 'MelbourneAirport', 'Watsonia', 'Albany', 'Melbourne', 'Ballarat', 'Brisbane', 'Williamtown', 'Bendigo', 'Newcastle', 'Launceston', 'Townsville', 'WaggaWagga')


## Klasyfikacja

In [21]:
import sklearn.linear_model as skl


def fit_by_regions(dt_reg):
    fits = []

    for i in range(0, len(dt_reg)):
        model = skl.LogisticRegression()
        x_train = dt_reg[i][0].values
        y_train = dt_reg[i][2]['RainTomorrow_0'].values
        model.fit(x_train, y_train)
        fits.append(model)

    return fits


regressors = fit_by_regions(data_regions)
print(len(regressors))

49


In [22]:
del df_new

## Testowy zbiór krajowy

In [23]:
import random as rnd


def build_country_test_set(dt_reg, samples=100):
    cardinal_set = []

    for i in range(0, len(dt_reg)):
        rows_no = dt_reg[i][1].shape[0]
        random_indices = [rnd.randint(0, rows_no - 1) for _ in range(0, samples)]
        x_test = dt_reg[i][1].iloc[random_indices, :]
        y_test = dt_reg[i][3].iloc[random_indices, :]
        cardinal_set.append([x_test, y_test])

    x = cardinal_set[0][0]
    y = cardinal_set[0][1]

    for i in range(1, len(cardinal_set)):
        x = x.append(cardinal_set[i][0], ignore_index=True)
        y = y.append(cardinal_set[i][1], ignore_index=True)

    return [x, y]


country_test_cardinal = build_country_test_set(data_regions)
print(country_test_cardinal[0].shape)
print(country_test_cardinal[1].shape)

(4900, 112)
(4900, 2)


### Metryki porównawcze

In [35]:
import sklearn.metrics as skt


def country_accuracy(test_c, regs):
    metrics = []

    for i in range(0, len(regs)):
        x_test_c = test_c[0].values
        y_test_c = test_c[1]['RainTomorrow_0'].values
        y_pred = regs[i].predict(x_test_c)
        metric = skt.accuracy_score(y_test_c, y_pred)
        metrics.append(metric)

    return metrics


country_accuracies = country_accuracy(country_test_cardinal, regressors)
country_reg_best = np.argmax(country_accuracies)
print(country_reg_best)
print(country_accuracies[country_reg_best] * 100, '%')
print(region_names[country_reg_best])

0
76.59183673469387 %
BadgerysCreek


In [36]:
def regions_accuracy(dt_reg, regs):
    metrics = []

    for i in range(0, len(dt_reg)):
        x_test_r = dt_reg[i][1].values
        y_test_r = dt_reg[i][3]['RainTomorrow_0'].values
        y_pred = regs[i].predict(x_test_r)
        metric = skt.accuracy_score(y_test_r, y_pred)
        metrics.append(metric)

    return metrics


regions_accuracy = regions_accuracy(data_regions, regressors)
regions_reg_best = np.argmax(regions_accuracy)
print(regions_reg_best)
print(regions_accuracy[regions_reg_best] * 100, '%')
print(region_names[regions_reg_best])
print(regions_reg_best == country_reg_best)

20
93.18181818181817 %
Woomera
False


- skuteczność dla całego kraju bez identyczna dla każdego regresora i można uznać, iż jest to 75 %
- skuteczność dla poszczególnych regionów waha się, lecz nadal wynosi co najmniej 63 %
- najprawdopodobniej przewidywanie dla całego kraju jest nadmiernie dopasowane do danych

In [37]:
del country_accuracies
del regions_accuracy
del regions_reg_best

In [44]:
def region_confusion_matrix_1(dt_reg, regs, reg_idx):
    x_test_r = dt_reg[reg_idx][1].values
    y_test_r = dt_reg[reg_idx][3]['RainTomorrow_0'].values
    y_pred = regs[reg_idx].predict(x_test_r)
    cm = skt.confusion_matrix(y_test_r, y_pred)
    score = skt.accuracy_score(y_test_r, y_pred)
    return cm, score


def country_confusion_matrix_2(test_c, regs, reg_idx):
    x_test_c = test_c[0].values
    y_test_c = test_c[1]['RainTomorrow_0'].values
    y_pred = regs[reg_idx].predict(x_test_c)
    cm = skt.confusion_matrix(y_test_c, y_pred)
    score = skt.accuracy_score(y_test_c, y_pred)
    return cm, score


cm_region, s_region = region_confusion_matrix_1(data_regions, regressors, country_reg_best)
cm_country, s_country = country_confusion_matrix_2(country_test_cardinal, regressors, country_reg_best)
print(cm_region)
print()
print(cm_country)
print()
print(s_region * 100, '%')
print(s_country * 100, '%')

[[  0 146]
 [  0 586]]

[[   0 1147]
 [   0 3753]]

80.05464480874316 %
76.59183673469387 %


- ostatecznie użyteczność względem zbioru krajowego, jak i zregionalizowanego jest podobna (75 %)
- lepsze to niż nic, lecz nadal pozostawia to spory margines błędu

In [45]:
del cm_country
del cm_region
del country_reg_best
del country_test_cardinal
del data_regions
del region_names
del regressors
del s_country
del s_region

In [46]:
!jupyter nbconvert --to pdf lab4.ipynb

[NbConvertApp] Converting notebook lab4.ipynb to pdf
[NbConvertApp] Writing 71540 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 63795 bytes to lab4.pdf
