# Exercise 1
Author: Tobias Famos

# Importing the needed Packages

In [366]:
import pandas
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from mlxtend.evaluate import accuracy_score
from mlxtend.classifier import OneRClassifier

# Task 1
First lets define a function to evaluate prediction rules. Prediction rules will be give and a lambda function doing it's decision based only on a single row.

## Loading the Data

In [367]:
df = pandas.read_csv("Data/titanic.csv")
df.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05
5,0,3,Mr. James Moran,male,27.0,0,0,8.4583
6,0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625
7,0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075
8,1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27.0,0,2,11.1333
9,1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14.0,1,0,30.0708


In [368]:
def is_correct_prediction(row, prediction_name):
    if row['Survived'] == row[prediction_name]:
        return True
    else:
        return False

def predict_and_evaluate(func):
    df['prediction'] = df.apply (lambda row: func(row), axis=1)
    correct_predictions = df.apply(lambda row: is_correct_prediction(row, 'prediction'), axis=1)
    accuracy =  correct_predictions.value_counts()[True] / correct_predictions.size
    print(f"Accuracy for {func.__name__}: {accuracy}")
    return accuracy


## Task 1.a) What is the best default rule for this dataset?
There are two possible rules without any prior knowledge of the person:
- survived = yes
- survived = no;
Let's count the values and see which brings the higher accuracy.

In [369]:
def label_all_died(row):
    return 0

def label_all_survived(row):
    return 1

predict_and_evaluate(label_all_died)
predict_and_evaluate(label_all_survived)



Accuracy for label_all_died: 0.6144306651634723
Accuracy for label_all_survived: 0.3855693348365276


0.3855693348365276

As one can see from the output above, the best accuracy is when we use the default rule survived = 0.

## Task 1.b)
What is the best 1R for this dataset?
Lets first import our OneRClassifier

Define a function to evaluate a OneR based on one attribute

In [370]:
def create_and_evaluate_1R_for_attribute(attribute):
    x_d = df[[attribute]]
    y = df["Survived"]
    x_d_train, x_d_test, y_train, y_test = train_test_split(x_d, y, test_size=0.2)
    one_r = OneRClassifier()
    one_r.fit(x_d_train.to_numpy(), y_train)
    y_pred = one_r.predict(x_d_test.to_numpy())
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for 1R based on attribute {attribute}: {accuracy}')

Evaluate all the possible Attributes

In [371]:
create_and_evaluate_1R_for_attribute('Pclass')
create_and_evaluate_1R_for_attribute('Sex')
create_and_evaluate_1R_for_attribute('Age')
create_and_evaluate_1R_for_attribute('Siblings/Spouses Aboard')
create_and_evaluate_1R_for_attribute('Parents/Children Aboard')
create_and_evaluate_1R_for_attribute('Fare')


Accuracy for 1R based on attribute Pclass: 0.6910112359550562
Accuracy for 1R based on attribute Sex: 0.8033707865168539
Accuracy for 1R based on attribute Age: 0.5842696629213483
Accuracy for 1R based on attribute Siblings/Spouses Aboard: 0.5898876404494382
Accuracy for 1R based on attribute Parents/Children Aboard: 0.5955056179775281
Accuracy for 1R based on attribute Fare: 0.6573033707865169


So the best 1R Rule for this datasets seems to be based on sex. Let's write the predictor just for fun and see the error throughout the whole dataset.

In [372]:
create_and_evaluate_1R_for_attribute('Fare')


Accuracy for 1R based on attribute Fare: 0.6910112359550562


In [373]:
def one_rule_sex(row):
    if row['Sex'] == 'female':
        return 1
    return 0
predict_and_evaluate(one_rule_sex)

Accuracy for one_rule_sex: 0.7857948139797069


0.7857948139797069

## Task 1.c) Can you produce a second rule based on a single attribute with a good effectiveness?

Sure, we will just take the second highest ranking rule above.

In [374]:
create_and_evaluate_1R_for_attribute('Pclass')
create_and_evaluate_1R_for_attribute('Sex')
create_and_evaluate_1R_for_attribute('Age')
create_and_evaluate_1R_for_attribute('Siblings/Spouses Aboard')
create_and_evaluate_1R_for_attribute('Parents/Children Aboard')
create_and_evaluate_1R_for_attribute('Fare')

Accuracy for 1R based on attribute Pclass: 0.702247191011236
Accuracy for 1R based on attribute Sex: 0.7752808988764045
Accuracy for 1R based on attribute Age: 0.5955056179775281
Accuracy for 1R based on attribute Siblings/Spouses Aboard: 0.6067415730337079
Accuracy for 1R based on attribute Parents/Children Aboard: 0.6067415730337079
Accuracy for 1R based on attribute Fare: 0.6685393258426966



Depending on the Test / Train split we run here, this is either the second best attribute to pick is either Fare or Class. As the Fare is just dependent on the class (First class being more expensive than third, we will just pick the class as the second rule).
Once again, I write the rule function and evaluate it over the whole dataset.

In [375]:
def one_rule_class(row):
    if row['Pclass'] == 1:
        return 1
    return 0

predict_and_evaluate(one_rule_class)

Accuracy for one_rule_class: 0.677564825253664


0.677564825253664

# Task 2
Using your select stock / market index and your decision criterion (binary or ternary) on the daily return of the next day or on the trend (daily return after 5/10 days), can you generate a 1R model using as possible predictor the volume, and the moving average (with a period of 5, 10, 20 50 or 200). You can learn on all days except the last 100 (that will be used as the test set).

Let's clarify the task first:
- Selected market index: SMI
- decision criterion: Binary (Up or down)
- timeframe: daily.

In [376]:
smi = pandas.read_csv("Data/SMI.csv")
smi.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,7750.0,7750.0,7750.0,7750.0,7750.0,7750.0
mean,6467.866175,6506.270381,6427.279904,6467.621751,6467.621751,36463130.0
std,2495.146649,2505.374595,2483.327749,2494.47988,2494.47988,36422500.0
min,1288.699951,1296.599976,1279.0,1287.599976,1287.599976,0.0
25%,5118.052368,5168.074829,5068.750122,5123.877563,5123.877563,0.0
50%,6683.669922,6721.63501,6644.150146,6681.550049,6681.550049,39747000.0
75%,8334.000244,8383.030273,8282.442871,8329.642334,8329.642334,57668980.0
max,12505.969727,12573.429688,12456.919922,12545.349609,12545.349609,346767700.0


In [377]:
smi.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1990-11-09,1378.900024,1389.0,1375.300049,1387.099976,1387.099976,0
1,1990-11-12,1388.099976,1408.099976,1388.099976,1407.5,1407.5,0
2,1990-11-13,1412.199951,1429.400024,1411.400024,1415.199951,1415.199951,0
3,1990-11-14,1413.599976,1413.599976,1402.099976,1410.300049,1410.300049,0
4,1990-11-15,1410.599976,1416.699951,1405.099976,1405.699951,1405.699951,0
5,1990-11-16,1405.699951,1407.400024,1389.400024,1395.199951,1395.199951,0
6,1990-11-19,1395.599976,1417.900024,1395.599976,1416.0,1416.0,0
7,1990-11-20,1414.800049,1415.0,1404.699951,1405.800049,1405.800049,0
8,1990-11-21,1405.599976,1405.599976,1396.699951,1398.400024,1398.400024,0
9,1990-11-22,1400.0,1401.400024,1384.5,1388.800049,1388.800049,0


From the Dataset Description we can see, that there seem to be a few missing values in the volume column. This must be kept in mind when moving forward.

Adding the daily return as a column in the dataset. We derive it using the closing price.
$r_{t} = \frac{p_{t} - p_{t-1}}{p_{t-1}}$


In [378]:
for index_smi in range(1,len(smi)):
    p_t = smi.loc[index_smi, 'Close']
    p_t_minus_1 = smi.loc[index_smi-1, 'Close']
    daily_return = (p_t - p_t_minus_1)/p_t_minus_1
    smi.loc[index_smi, 'Daily Return'] = daily_return

In [379]:
for index_smi in range(2,len(smi)):
    smi.loc[index_smi, 'Yesterday Daily Return'] = smi.loc[index_smi-1, 'Daily Return']

Building the ground truth labels

In [380]:
def build_ground_truth(row):
    if row['Daily Return'] > 0:
        return 1
    return 0

smi['Is Going Up'] = smi.apply (lambda row: build_ground_truth(row), axis=1)

Build the same model as above but don't do arbitrary test / validation splits.

In [381]:
x = smi[['Yesterday Daily Return']]
le = preprocessing.LabelEncoder()
y = smi['Is Going Up']
y = le.fit_transform(y)

x_train = x[2:-100]
y_train = y[2:-100]
x_test = x[-100:]
y_test = y[-100:]

one_r_2 = OneRClassifier()
one_r_2.fit(x_train.to_numpy(), y_train)
y_pred = one_r_2.predict(x_test.to_numpy())
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for 1R based on yesterday daily return: {accuracy}')


Accuracy for 1R based on yesterday daily return: 0.4


Now we have a weird problem here. My OneRClassifier predicts all the time 0 (going down). Althoug it has been trained on data not only including going down.
I was unable to solve this problem. So here is a naive approach predicting a change to continue once it started. (Taking yesterdays trend and predicting it to continue)

In [382]:
def predict_same_as_yesterday(row):
    if row['Yesterday Daily Return']>0:
        return 1
    return 0

def is_correct_prediction_smi(row, prediction_name):
    if row['Is Going Up'] == row[prediction_name]:
        return True
    else:
        return False

smi['Prediction'] = smi.apply (lambda row: predict_same_as_yesterday(row), axis=1)
correct_predictions = smi.apply(lambda row: is_correct_prediction_smi(row, 'Prediction'), axis=1)
accuracy =  correct_predictions.value_counts()[True] / correct_predictions.size
print(f"Accuracy {accuracy}")

Accuracy 0.5054193548387097


And Thats a pretty bad accuracy. Almost the same as a coin toss...