# Aufgabe 3

### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Raise the pandas display limits to 50 columns and 500 rows.
for option_name, limit in (("display.max_columns", 50), ("display.max_rows", 500)):
    pd.set_option(option_name, limit)

# a)

### Read data

In [3]:
# Load the employment workbook and report its (rows, columns) shape.
# NOTE(review): the loaded frame shows an `Unnamed: 0` column that looks like
# a saved row index; it later ends up among the model inputs — confirm whether
# `index_col=0` should be used here.
employment = pd.read_excel(io="dataset/employment_08_09.xlsx")
n_rows, n_cols = employment.shape
print((n_rows, n_cols))

(5412, 21)


In [4]:
# Preview the first five rows.
employment.head(n=5)

Unnamed: 0,age,race,earnwke,employed,unemployed,married,union,ne_states,so_states,ce_states,we_states,government,private,self,educ_lths,educ_hs,educ_somecol,educ_aa,educ_bac,educ_adv,female
0,53,1,,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
1,39,1,,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1
2,41,1,500.0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
3,27,1,520.0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
4,29,3,615.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


### Variable overview

1. Target variables are `employed` and `unemployed`. They should be distinct and exhaustive. Check and leave one out for later analysis.

2. The variable `race` encodes three possible states. Transform to dummies and leave one out for later analysis.

3. The variables `ne_states`, `so_states`, `ce_states` and `we_states` should be distinct and exhaustive. Check and leave one out for later analysis.

4. The variables `private`, `government` and `self` should be distinct and exhaustive. Check and leave one out for later analysis.

5. The variables `educ_lths`, `educ_hs`, `educ_somecol`, `educ_aa`, `educ_bac` and `educ_adv` should be distinct and exhaustive. Check and leave one out for later analysis.

6. The variable `earnwke` is probably not available for self-employed people. Check later and handle possible missing values.

### 1. Sanity check target variables

In [5]:
# Grouped histogram of `employed`, coloured by `unemployed`, to check visually
# whether the two target flags are complementary.
fig = px.histogram(
    data_frame=employment,
    x="employed",
    color="unemployed",
    barmode="group",
)
fig.show()

One can see that there are 435 data points which contain no information about whether or not the person was employed in 2009.

These points will be removed from the data.

In [6]:
# Rows where `employed` and `unemployed` hold the same value (0/0 or 1/1)
# carry no usable employment status.
unknown_employment_status = employment["employed"] == employment["unemployed"]

# Keep only rows with a definite status. Among those the two flags always
# differ, so `unemployed` is redundant and is dropped. The result is rebound
# instead of using `inplace=True`, which would mutate a frame that was just
# obtained by boolean indexing (chained-assignment risk on pandas versions
# without copy-on-write). The former no-op expression statement that filtered
# the mask and discarded the result has been removed.
employment = employment[~unknown_employment_status].drop(columns="unemployed")
employment.head()

Unnamed: 0,age,race,earnwke,employed,married,union,ne_states,so_states,ce_states,we_states,government,private,self,educ_lths,educ_hs,educ_somecol,educ_aa,educ_bac,educ_adv,female
0,53,1,,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
1,39,1,,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1
2,41,1,500.0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
3,27,1,520.0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
4,29,3,615.0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


### Check distribution of target variable

In [7]:
# Count how many rows fall into each class of the target variable.
target_group = employment["employed"].value_counts()
n_employed, n_unemployed = target_group[1], target_group[0]
print(f"Amount of employment vs. unemployment in data: {n_employed} vs. {n_unemployed}")

Amount of employment vs. unemployment in data: 4738 vs. 239


One can see that the unemployed class is heavily underrepresented in the data (239 vs. 4738). What to do? One could oversample it by bootstrapping, or undersample the employed class by leaving out training data (is that a good idea?).

### 2. Dummy transform `race` column

### Check if missing values exist per feature

In [None]:
# Per column: True if at least one value is missing.
employment.isnull().any(axis="index")

### Remove rows that contain missing values

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): in the preview the rows with missing `earnwke` are the ones
# with `self == 1`, so this presumably removes the self-employed entirely —
# confirm this is intended.
employment.dropna(axis=0, how="any", inplace=True)
employment.head(n=5)

### Check for variables that do not contain any information (no information means in this case: value does not vary)

In [None]:
# Collect the columns that take only a single distinct value; the columns
# listed in `skipped_columns` are not checked.
skipped_columns = ("age", "race", "earnwke", "employed")
irrelevant_vars = set()
for col in employment.columns:
    if col not in skipped_columns and employment[col].nunique() == 1:
        print(f"variable '{col}' does not contain any information.")
        irrelevant_vars.add(col)

In [None]:
# Remove the constant columns collected in `irrelevant_vars`.
employment.drop(columns=list(irrelevant_vars), inplace=True)

### Define input and target variables

In [None]:
# Target is the `employed` flag; every remaining column is used as input.
y = employment["employed"]
X = employment.drop(columns=["employed"])

### Get train/test split

In [None]:
# 80/20 shuffled split, stratified on the target.
# NOTE(review): no `random_state` is passed, so the split differs between runs.
input_train, input_test, label_train, label_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    stratify=y,
)

## Possible model classes

The target variable is discrete (more specifically binary) and that's why we are doing classification.

Possible classification algorithms are:
- k-Nearest-Neighbour
- Logistic Regression
- Support Vector Machine
- Decision Tree
- Neural Network

## Linear Regression

In [None]:
# Ordinary least-squares fit of the 0/1 `employed` label on the training inputs.
model = linear_model.LinearRegression()
model.fit(X=input_train, y=label_train)

In [None]:
plt.rcParams["figure.figsize"] = (11, 9)

# LinearRegression returns unbounded real-valued scores. Map them to the two
# class labels with one explicit 0.5 cut-off, computed once and reused for
# both metrics. Plain rounding would turn scores outside [-0.5, 1.5) into
# labels such as -1 or 2, which then show up as extra classes in the accuracy
# and the confusion matrix. For scores inside that interval the result is
# identical to rounding (a score of exactly 0.5 maps to 0 in both cases).
y_hat = model.predict(input_test)
y_pred = (y_hat > 0.5).astype(int)

print("accuracy: ", accuracy_score(label_test, y_pred))

cm = confusion_matrix(label_test, y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

# b)

### Read data

In [None]:
# Load the growth workbook and preview the first five rows.
growth = pd.read_excel(io="dataset/Growth.xlsx")
growth.head(n=5)

### Get familiar with data

In [None]:
# Rows ordered from the highest to the lowest `assasinations` value
# (the spelling matches the column name used in the data).
growth.sort_values(by=["assasinations"], ascending=[False])

In [None]:
# Rows ordered from the highest to the lowest `yearsschool` value.
growth.sort_values(by=["yearsschool"], ascending=[False])

In [None]:
# Show the row(s) whose `country_name` is "Germany".
is_germany = growth["country_name"] == "Germany"
growth[is_germany]

### Remove feature `oil` because it contains no information

In [None]:
# Remove the `oil` column in place, then preview the first five rows.
growth.drop(columns=["oil"], inplace=True)
growth.head(n=5)

### Choose target variable and regressors

In [None]:
# `country_name` is excluded from the regression data; `growth` is the
# target and all remaining columns are the regressors.
regression_data = growth.drop(columns=["country_name"])
X = regression_data.drop(columns=["growth"])
y = regression_data["growth"]
print(y)
print(X)

In [None]:
# Pairwise scatter plots of all columns in the regression data.
fig = px.scatter_matrix(
    data_frame=regression_data,
    width=1024,
    height=720,
)
fig.show()

### Create training and test-split

In [None]:
# Shuffled train/test split using the default split sizes.
# NOTE(review): no `random_state` is passed, so the split differs between runs.
input_train, input_test, target_train, target_test = train_test_split(
    X,
    y,
    shuffle=True,
)

### Baseline

In [None]:
# Baseline predictor: the mean of the training targets.
avg_growth = target_train.mean(skipna=True)
print(avg_growth)

In [None]:
# Predict the training mean for every test row and score that constant guess.
n_test_rows = len(target_test)
y_hat = np.full(n_test_rows, avg_growth)
print("MSE: ", mean_squared_error(target_test, y_hat))

### Linear Regression

In [None]:
# Ordinary least-squares fit of `growth` on the training regressors.
model = linear_model.LinearRegression()
model.fit(X=input_train, y=target_train)

In [None]:
# Score the fitted model on the held-out rows.
y_hat = model.predict(input_test)

test_mse = mean_squared_error(target_test, y_hat)
test_r2 = r2_score(target_test, y_hat)
print("MSE: ", test_mse)
print("R2: ", test_r2)

# c)

### Read data

In [None]:
# Load the whitespace-separated crab data; the file has no header row, so the
# column names are supplied here and `id` becomes the index.
# `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated in
# pandas 2.2+; the pandas docs describe the two as equivalent.
crabs = pd.read_csv(
    "dataset/crabs.txt",
    header=None,
    sep=r"\s+",
    names=["id", "color", "spine", "width", "weight", "satellites"],
    index_col=0,
)

In [None]:
# Preview the first five rows.
crabs.head(n=5)

### Get familiar with data

In [None]:
# Rows ordered from the highest to the lowest `satellites` value.
crabs.sort_values(by=["satellites"], ascending=[False])

Target variable is `satellites`.

### Transform data

In [None]:
# One-hot encode `color` and `spine`, dropping the first level of each
# (`drop_first=True`), then preview the result.
crabs = pd.get_dummies(
    data=crabs,
    columns=["color", "spine"],
    drop_first=True,
)
crabs.head(n=5)

In [None]:
# Distribution of the target variable `satellites`.
satellite_counts = crabs["satellites"]
fig = px.histogram(satellite_counts)
fig.show()

# d)

In [None]:
# Load the semicolon-separated gasoline data, using the first column as index.
gasoline = pd.read_csv(
    filepath_or_buffer="dataset/gasoline.csv",
    sep=";",
    index_col=0,
)

In [None]:
# Preview the first five rows.
gasoline.head(n=5)