# Load the train and the test data from directory:

After loading the data, we want to asssign the variables.

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import tree, metrics
from sklearn.metrics import mean_squared_error, accuracy_score
import matplotlib.pyplot as plt

from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

#### Choose a directory:

In [2]:
train = pd.read_csv('../data_frames/wirtschaft/train_data', index_col=None, header=0)
test = pd.read_csv('../data_frames/wirtschaft/test_data', index_col=None, header=0)

---

In [3]:
hot_train_data = train.copy()
hot_train_data[hot_train_data > 0] = 1

In [4]:
hot_test_data = test.copy()
hot_test_data[hot_test_data > 0] = 1

In [5]:
y_train = hot_train_data['goal_val']
y_test = hot_test_data['goal_val']
X_train = hot_train_data.drop(['goal_val'], axis=1)
X_test = hot_test_data.drop(['goal_val'], axis=1)

---

# Test a model and calculate the errors: StochasticGradientDescent

In [6]:
sgd = linear_model.SGDClassifier(max_iter=500, tol=1e-3)
sgd.fit(X_train, y_train)  

prediction_test = sgd.predict(X_test)
prediction_train = sgd.predict(X_train)

error_test = mean_squared_error(y_test, prediction_test)
error_train = mean_squared_error(y_train, prediction_train)
    
print(f'The TEST prediction is {(1 - error_test)*100} % ACCURATE')

print(f'The TRAIN prediction is {(1 - error_train)*100} % ACCURATE')

The TEST prediction is 77.82515991471215 % ACCURATE
The TRAIN prediction is 91.10915492957746 % ACCURATE


---

# Test a model and calculate the errors: RandomForestClassifier

In [7]:
rfc = RandomForestClassifier(n_estimators= 50, max_depth = 15, bootstrap = False, max_leaf_nodes=256)
rfc.fit(X_train, y_train)

prediction_test = rfc.predict(X_test)
prediction_train = rfc.predict(X_train)

error_test = mean_squared_error(y_test, prediction_test)
error_train = mean_squared_error(y_train, prediction_train)
    
print(f'The TEST prediction is {(1 - error_test)*100} % ACCURATE')

print(f'The TRAIN prediction is {(1 - error_train)*100} % ACCURATE')

The TEST prediction is 91.25799573560768 % ACCURATE
The TRAIN prediction is 93.39788732394366 % ACCURATE


---
# Test a model and calculate the errors: LogisticRegression

In [8]:
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf.fit(X_train, y_train)

prediction_test = clf.predict(X_test)
prediction_train = clf.predict(X_train)

error_test = mean_squared_error(y_test, prediction_test)
error_train = mean_squared_error(y_train, prediction_train)
    
print(f'The TEST prediction is {(1 - error_test)*100} % ACCURATE')

print(f'The TRAIN prediction is {(1 - error_train)*100} % ACCURATE')

The TEST prediction is 91.25799573560768 % ACCURATE
The TRAIN prediction is 93.22183098591549 % ACCURATE


---
# Test a model and calculate the errors: DecisionTreeClassifier

In [9]:
dtc = DecisionTreeClassifier(max_depth = 16, min_samples_split = 6)
dtc.fit(X_train, y_train)

prediction_test = dtc.predict(X_test)
prediction_train = dtc.predict(X_train)

error_test = mean_squared_error(y_test, prediction_test)
error_train = mean_squared_error(y_train, prediction_train)
    
print(f'The TEST prediction is {(1 - error_test)*100} % ACCURATE')

print(f'The TRAIN prediction is {(1 - error_train)*100} % ACCURATE')

The TEST prediction is 90.8315565031983 % ACCURATE
The TRAIN prediction is 93.13380281690141 % ACCURATE


---
# Test a model and calculate the errors: KNN

In [10]:
knn = neighbors.KNeighborsRegressor(8, weights='distance')
knn.fit(X_train, y_train)

prediction_test = knn.predict(X_test)
prediction_train = knn.predict(X_train)

error_test = mean_squared_error(y_test, prediction_test)
error_train = mean_squared_error(y_train, prediction_train)
    
print(f'The TEST prediction is {(1 - error_test)*100} % ACCURATE')

print(f'The TRAIN prediction is {(1 - error_train)*100} % ACCURATE')

#error_test = []
#error_train = []
#
#for i in range(1, 50):
#    neighbor = i
#
#    knn = neighbors.KNeighborsRegressor(neighbor, weights='distance')
#    model = knn.fit(X_train, y_train)
#    y_pred = model.predict(X_test)
#    y_train_pred = model.predict(X_train)
#    error_test.append(mean_squared_error(y_test, y_pred))
#    error_train.append(mean_squared_error(y_train, y_train_pred))

The TEST prediction is 91.44598563357683 % ACCURATE
The TRAIN prediction is 96.75469483568075 % ACCURATE


---