## 1 Datenerhebung mittels API & Web Scraping

In [1]:
#import modules
import pandas as pd
import numpy as np
import seaborn as sns
import time
import datetime
#sns.get_dataset_names()
from pandas_datareader import data
from sklearn.model_selection import train_test_split


### 1.1 Yahoo Finance API: Aktienkurs

In [2]:
# Download the daily AAPL quotes from the Yahoo Finance CSV endpoint and store them locally.
ticker = 'AAPL'
# Query window as epoch seconds. time.mktime interprets the naive datetimes in the
# local timezone of the machine that runs the notebook.
period1 = int(time.mktime(datetime.datetime(2010, 1, 1, 23, 59).timetuple()))
period2 = int(time.mktime(datetime.datetime(2022, 2, 1, 23, 59).timetuple()))
interval = '1d'
query_string = f'https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={period1}&period2={period2}&interval={interval}&events=history&includeAdjustedClose=true'
data = pd.read_csv(query_string)
print(data)
# index=False: the RangeIndex carries no information and would otherwise come back as a
# spurious "Unnamed: 0" column when the file is read again with pd.read_csv.
data.to_csv('APPL Prices.csv', index=False)

            Date        Open        High         Low       Close   Adj Close  \
0     2010-01-04    7.622500    7.660714    7.585000    7.643214    6.505279   
1     2010-01-05    7.664286    7.699643    7.616071    7.656429    6.516527   
2     2010-01-06    7.656429    7.686786    7.526786    7.534643    6.412874   
3     2010-01-07    7.562500    7.571429    7.466071    7.520714    6.401018   
4     2010-01-08    7.510714    7.571429    7.466429    7.570714    6.443572   
...          ...         ...         ...         ...         ...         ...   
3037  2022-01-26  163.500000  164.389999  157.820007  159.690002  158.526489   
3038  2022-01-27  162.449997  163.839996  158.279999  159.220001  158.059921   
3039  2022-01-28  165.710007  170.350006  162.800003  170.330002  169.088974   
3040  2022-01-31  170.160004  175.000000  169.509995  174.779999  173.506546   
3041  2022-02-01  174.009995  174.839996  172.309998  174.610001  173.337784   

         Volume  
0     493729600  
1  

### 1.2 Web Scraping

In [3]:
import csv
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
from textblob import TextBlob  # used by the sentiment export below; was referenced without an import

pagesToGet = 99
search_term = "Apple Inc." # Change this to the search term you want to use

upperframe = []

filename = "NEWS.csv"
# The output file is opened exactly once. Re-opening it with mode "w" for every page
# truncated the rows of all previous pages and leaked one file handle per page.
with open(filename, "w", encoding='utf-8') as f:
    headers = "Statement,Link,Date,Source,Label\n"
    f.write(headers)

    for page in range(1, pagesToGet+1):
        print('processing page :', page)
        url = 'https://www.politifact.com/factchecks/list/?page=' + str(page)
        print(url)
        try:
            response = requests.get(url)
        except Exception:
            # Best effort: report the failing page and carry on with the next one.
            error_type, error_obj, error_info = sys.exc_info()
            print ('ERROR FOR LINK:', url)
            print (error_type, 'Line:', error_info.tb_lineno)
            continue
        time.sleep(2)  # pause between requests
        soup = BeautifulSoup(response.text, 'html.parser')
        frame = []
        links = soup.find_all('li', attrs={'class': 'o-listicle__item'})
        print(len(links))
        for j in links:
            Statement = j.find("div", attrs={'class': 'm-statement__quote'}).text.strip()
            # Keep only statements that mention the search term (case-insensitive).
            if search_term.lower() not in Statement.lower():
                continue
            Link = "https://www.politifact.com"
            Link += j.find("div", attrs={'class': 'm-statement__quote'}).find('a')['href'].strip()
            Date = j.find('div', attrs={'class': 'm-statement__body'}).find('footer').text[-14:-1].strip()
            Source = j.find('div', attrs={'class': 'm-statement__meta'}).find('a').text.strip()
            Label = j.find('div', attrs={'class': 'm-statement__content'}).find('img', attrs={'class': 'c-image__original'}).get('alt').strip()
            frame.append((Statement, Link, Date, Source, Label))
            # Commas inside a field are replaced by "^" so the hand-written CSV stays parseable.
            f.write(Statement.replace(",", "^") + "," + Link + "," + Date.replace(",", "^") + "," + Source.replace(",", "^") + "," + Label.replace(",", "^") + "\n")
        upperframe.extend(frame)

# Stored under its own name: assigning the scraped news to `data` replaced the price
# frame that the data-preparation and database cells read from `data`.
news = pd.DataFrame(upperframe, columns=['Statement', 'Link', 'Date', 'Source', 'Label'])
print(news.head())

# For additional task 6 (NLP)

# NOTE(review): `titles` was never defined in the notebook; the scraped statements are
# assumed to be the texts that should be scored - confirm.
titles = news['Statement']

with open('APPL News.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Sentiment', 'Sentiment Label'])  # write the column headers
    for title in titles:
        blob = TextBlob(title)
        sentiment = blob.sentiment.polarity
        # Map the polarity onto three classes with a neutral band of +/- 0.2.
        if sentiment < -0.2:
            sentiment_label = 'negativ'
        elif sentiment > 0.2:
            sentiment_label = 'positiv'
        else:
            sentiment_label = 'neutral'
        writer.writerow([title, sentiment, sentiment_label])


processing page : 1
https://www.politifact.com/factchecks/list/?page=1
30
processing page : 2
https://www.politifact.com/factchecks/list/?page=2
30
processing page : 3
https://www.politifact.com/factchecks/list/?page=3
30
processing page : 4
https://www.politifact.com/factchecks/list/?page=4
30
processing page : 5
https://www.politifact.com/factchecks/list/?page=5
30
processing page : 6
https://www.politifact.com/factchecks/list/?page=6
30
processing page : 7
https://www.politifact.com/factchecks/list/?page=7
30
processing page : 8
https://www.politifact.com/factchecks/list/?page=8
30
processing page : 9
https://www.politifact.com/factchecks/list/?page=9
30
processing page : 10
https://www.politifact.com/factchecks/list/?page=10
30
processing page : 11
https://www.politifact.com/factchecks/list/?page=11
30
processing page : 12
https://www.politifact.com/factchecks/list/?page=12
30
processing page : 13
https://www.politifact.com/factchecks/list/?page=13
30
processing page : 14
https://w

## 2 Datenaufbereitung

### Entfernen NAs und Duplikate, Erstellen neuer Variablen, Anreicherung der Daten

In [None]:
# Descriptive statistics and column types of the raw frame. describe() is not the last
# expression of the cell, so its result has to be printed to appear in the output;
# info() prints its report itself.
print(data.describe())
data.info()

# Data cleaning: remove rows with missing values and exact duplicates, then parse the
# date strings into datetimes. `data` itself is left unchanged.
df = (
    data
    .dropna()
    .drop_duplicates()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

## 3 DB - PostgreSQL DB initiate -> In Docker

In [None]:
# Libraries
import os
import fnmatch
import tempfile
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

os.environ['MPLCONFIGDIR'] = "/home/jovyan"
import matplotlib.pyplot as plt

# Settings
import warnings
warnings.filterwarnings("ignore")

# NOTE(review): the database credentials are hard-coded in the connection strings;
# consider reading them from the environment.

# Insert data to appl_prices (an existing table of that name is replaced)
engine = create_engine('postgresql://admin:secret@db:5432/postgres')
data.to_sql('appl_prices', engine, if_exists='replace')

# Connect DB
conn = psycopg2.connect("host=db dbname=postgres user=admin password=secret")
try:
    cur = conn.cursor()
    try:
        # Add the column "positive"
        cur.execute("ALTER TABLE appl_prices ADD COLUMN Positive INTEGER DEFAULT 0;")

        # Persist the change
        conn.commit()
    finally:
        cur.close()
finally:
    # Close the connection even when the statement above fails
    conn.close()

In [None]:
# Connect DB
conn = psycopg2.connect("host=db dbname=postgres user=admin password=secret")
try:
    cur = conn.cursor()
    try:
        # Flag every row whose close is at or above its open with 1, all others with 0
        cur.execute("""UPDATE appl_prices SET positive = CASE WHEN "Close" >= "Open" THEN 1 ELSE 0 END;""")

        # Persist the change
        conn.commit()
    finally:
        cur.close()
finally:
    # Close the connection even when the update fails
    conn.close()

In [None]:
# Connect DB
conn = psycopg2.connect("host=db dbname=postgres user=admin password=secret")
try:
    cur = conn.cursor()
    try:
        # Select the first ten rows of the table as a quick sanity check
        cur.execute("""SELECT * FROM appl_prices LIMIT 10;""")

        rows = cur.fetchall()
        for row in rows:
            print(row)
    finally:
        cur.close()
finally:
    # Close the connection even when the query fails
    conn.close()

## 4. EDA

In [None]:
#Daten aus DB lesen und bearbeiten
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_datareader import data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Exploratory data analysis
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in
# print() (that only appended a stray "None" to the output).
df.info()
print(df.describe())

# Plotting: closing price over time
sns.set_style('whitegrid')
plt.figure(figsize=(12,6))
plt.title('Apple Stock Price')
plt.xlabel('Year')

plt.ylabel('Price ($)')
sns.lineplot(data=df, x='Date', y='Close')
plt.show()

# Closing price together with its day-over-day difference in one figure
plt.figure(figsize=(12,6))
plt.title('Daily Change in Apple Stock Price')
plt.xlabel('Year')
plt.ylabel('Change in price ($)')
sns.lineplot(data=df, x='Date', y='Close').set(ylabel='Price ($)', xlabel='Year')
sns.lineplot(data=df, x='Date', y=df['Close'].diff()).set(ylabel='Change in price ($)', xlabel='Year')
plt.legend(labels=['Price', 'Daily Change'])
plt.show()

# Distribution of the closing price
plt.figure(figsize=(12,6))
plt.title('Apple Stock Price Distribution')
sns.histplot(data=df, x='Close', bins=30)
plt.show()

# Split the data into training and test sets
# The single feature is reshaped to (n_samples, 1) as scikit-learn expects a 2-D matrix.
X = df['Open'].values.reshape(-1, 1)
y = df['Close'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on test data
y_pred = model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'R-squared: {r2:.2f}')
# The MSE was computed but never reported (the cell ended in a bare `print`).
print(f'MSE: {mse:.2f}')

### 5. Verwendung eines ML Frameworks/Library & 6. Erstellen von Modellvorhersagen

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import time
import datetime
import psycopg2
import sqlite3
from pandas_datareader import data
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sqlalchemy import create_engine
from psycopg2 import connect, extensions

os.environ['MPLCONFIGDIR'] = "/home/jovyan"
import matplotlib.pyplot as plt

# Settings
import warnings
warnings.filterwarnings("ignore")

# Fetch data from Yahoo Finance
ticker = 'AAPL'
period1 = int(time.mktime(datetime.datetime(2010, 1, 1, 23, 59).timetuple()))
period2 = int(time.mktime(datetime.datetime(2022, 2, 1, 23, 59).timetuple()))
interval = '1d'
query_string = f'https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={period1}&period2={period2}&interval={interval}&events=history&includeAdjustedClose=true'
data = pd.read_csv(query_string)

# Create DB
# DROP/CREATE DATABASE cannot run inside a transaction block, hence autocommit and one
# statement per execute() call. The statements were previously only stored in a string
# and never executed, so the task5 database did not exist when it was connected to.
auto_commit = extensions.ISOLATION_LEVEL_AUTOCOMMIT
connection = psycopg2.connect("host=db dbname=postgres user=admin password=secret")
try:
    print(connection)
    connection.set_isolation_level(auto_commit)
    cur = connection.cursor()
    try:
        cur.execute("DROP DATABASE IF EXISTS task5;")
        cur.execute("CREATE DATABASE task5;")
    finally:
        cur.close()
finally:
    connection.close()

# Insert data to appl_prices
engine = create_engine('postgresql://admin:secret@db:5432/task5')
data.to_sql('appl_prices', engine, if_exists='replace')
# Return the pooled connections; an open session on task5 would make the
# DROP DATABASE above fail when this cell is run again.
engine.dispose()

# Connect DB (this connection and cursor stay open for the statements that follow)
connection = psycopg2.connect("host=db dbname=task5 user=admin password=secret")

# Select DB content into a DataFrame, taking the column names from the cursor metadata
cur = connection.cursor()
cur.execute("""SELECT * FROM appl_prices;""")
rows = cur.fetchall()
df = pd.DataFrame(rows, columns=[desc[0] for desc in cur.description])
print(df)

# Print the first ten rows as raw tuples
cur.execute("""SELECT * FROM appl_prices LIMIT 10;""")
rows = cur.fetchall()
for row in rows:
    print(row)
    
# Parse the date column into datetimes
df['Date'] = pd.to_datetime(df['Date'])

# Line chart of the closing price over time
sns.set_style('whitegrid')
plt.figure(figsize=(12,6))
plt.title('Apple Stock Price')
plt.xlabel('Year')
plt.ylabel('Price ($)')
sns.lineplot(data=df, x='Date', y='Close')
plt.show()

# Feature matrix (opening price, one column) and target vector (closing price),
# followed by a seeded random 80/20 split
X = df['Open'].to_numpy()[:, None]
y = df['Close'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# float32 tensors of the training portion
X_train_tensor = torch.from_numpy(X_train).to(torch.float32)
y_train_tensor = torch.from_numpy(y_train).to(torch.float32)

# Define the model architecture
class Net(nn.Module):
    """Fully connected network: 1 input -> 64 -> 64 -> 1 output, ReLU after the first two layers."""

    def __init__(self):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(1, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)

    def forward(self, x):
        """Apply the three linear layers to `x`; the output layer has no activation."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Seed the weight initialisation so the reported metrics are reproducible.
torch.manual_seed(42)
net = Net()

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

# The network outputs shape (N, 1). A flat (N,) target would be broadcast against it
# to (N, N) inside the loss and produce a wrong value, so the target is brought to
# (N, 1) first. reshape is a no-op if it already has that shape.
train_targets = y_train_tensor.reshape(-1, 1)

# Train the model (full-batch gradient descent)
num_epochs = 5000
for epoch in range(num_epochs):
    # Forward pass
    y_pred = net(X_train_tensor)
    loss = criterion(y_pred, train_targets)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Progress report every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Make predictions for the complete data set (training and test rows)
X_tensor = torch.from_numpy(X).float()

# no_grad: inference only, no gradient bookkeeping needed
with torch.no_grad():
    y_pred_tensor = net(X_tensor)

y_pred = y_pred_tensor.numpy().flatten()

# Make predictions on test set
X_test_tensor = torch.from_numpy(X_test).float()
with torch.no_grad():
    y_test_pred_tensor = net(X_test_tensor)

# Flatten the (N, 1) output so it lines up with the 1-D y_test
y_test_pred = y_test_pred_tensor.numpy().flatten()

# Compute R2-score and MSE on test set
r2 = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)

print(f"R2-score on test set: {r2:.4f}")
print(f"MSE on test set: {mse:.4f}")

# Plot predictions against true values
plt.figure(figsize=(12,6))
plt.title('Apple Stock Price Predictions')
plt.xlabel('Open Price ($)')
plt.ylabel('Close Price ($)')
sns.scatterplot(x=X_test.flatten(), y=y_test)
sns.lineplot(x=X_test.flatten(), y=y_test_pred, color='red')
plt.show()

# Close the database connection opened in this cell. The handle is called `connection`;
# `conn` belongs to an earlier cell and is already closed.
cur.close()
connection.close()

### 7. Evaluation der Modelle mit Hilfe geeigneter Modellgütemasse


In [None]:
#Das gegebene Python-Skript führt eine Regression durch, um die Schlusskurse von AAPL Aktien anhand der Eröffnungskurse vorherzusagen. Um die Modellevaluation durchzuführen, können wir R2-Score und Mean Squared Error (MSE) verwenden. Der R2-Score misst den Anteil der Varianz in der abhängigen Variable (y) , der durch das Modell erklärt wird, während der MSE den durchschnittlichen quadratischen Fehler zwischen den vorhergesagten Werten und den tatsächlichen Werten berechnet.

#In diesem Skript wird ein neuronales Netzwerk trainiert und validiert, um die Schlusskurse vorherzusagen. Es gibt einen Trainings- und einen Testdatensatz. Nachdem das Modell trainiert wurde, werden Vorhersagen auf dem Testdatensatz gemacht, und der R2-Score und der MSE werden berechnet. Anschliessend werden die Vorhersagen grafisch den tatsächlichen Werten gegenübergestellt.

### 8. Korrekte Interpretation der Modellergebnisse und Modellgütemasse

In [None]:
#Der R2-Score und MSE können wie folgt interpretiert werden:

#Ein R2-Score von 1 bedeutet, dass das Modell alle Variationen in der abhängigen Variable erklärt und perfekt vorhersagt. Ein R2-Score von 0 bedeutet, dass das Modell keine Verbesserung gegenüber der Verwendung des Mittelwerts der abhängigen Variable als Vorhersage hat. Ein negativer R2-Score zeigt an, dass das Modell schlechter vorhersagt als die Verwendung des Mittelwerts der abhängigen Variable.
#Ein kleiner MSE zeigt an, dass das Modell die tatsächlichen Werte besser vorhersagt.
#Im Kontext dieses Skripts zeigt ein hoher R2-Score und ein niedriger MSE, dass das neuronale Netzwerk in der Lage ist, die Schlusskurse von AAPL Aktien basierend auf den Eröffnungskursen mit hoher Genauigkeit vorherzusagen.

## Zusatzpunkte

### Z.1

### Z.2 Docker (siehe Ordner Docker)

### Z.3 Integration und Visualisierung von geographischen Daten

In [None]:
import yfinance as yf
import folium
import requests
import webbrowser
import os
from bs4 import BeautifulSoup

# Get the exchange code of the AAPL listing from Yahoo Finance
ticker = yf.Ticker('AAPL').info
market_place = ticker['exchange']
print('Ticker: AAPL')
print('Market Place:', market_place)

# Nominatim API URL to get geocoding data for exchange locations
nominatim_api_url = 'https://nominatim.openstreetmap.org/search'

exchange_symbols = market_place
print(exchange_symbols)

# Get location data for the stock exchange.
# The query goes through `params` so it is URL-encoded, and an identifying User-Agent
# is sent (required by the Nominatim usage policy).
response = requests.get(
    nominatim_api_url,
    params={'q': f'{exchange_symbols} stock exchange', 'format': 'json'},
    headers={'User-Agent': 'aapl-notebook/1.0'},
    timeout=30,
)
response.raise_for_status()
results = response.json()
if not results:
    # An empty result list would otherwise surface as an unhelpful IndexError.
    raise ValueError(f'No geocoding result for {exchange_symbols!r} stock exchange')
location_data = results[0]

# Extract latitude and longitude from location data
lat = float(location_data['lat'])
lon = float(location_data['lon'])

# Create a folium map centered on the stock exchange
m = folium.Map(location=[lat, lon], zoom_start=16)

# Add a marker for the stock exchange
folium.Marker(location=[lat, lon], tooltip='NMS stock exchange').add_to(m)

# Save the map and open it in the default browser
m.save('Exchange.html')
url = 'file://' + os.path.abspath('Exchange.html')
webbrowser.open(url)

# Last expression of the cell, so the notebook renders the map inline
m

### Z.4 Neuronales Netz (LSTM)

In [None]:
# Bibliotheken importieren
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# Read the stored price data
df = pd.read_csv('APPL Prices.csv')

# Reduce the data set to the "Close" column
data = df.filter(['Close'])

# Convert to a numpy array of shape (n_rows, 1)
dataset = data.values

# Number of rows used for training (first 80 %)
training_data_len = int(np.ceil( len(dataset) * 0.8 ))

# Scale the data to [0, 1]
# NOTE(review): the scaler is fitted on the complete series, so the min/max of the test
# period leak into the training data. Fitting on the training rows only would avoid that.
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)

# Training portion of the scaled series
train_data = scaled_data[0:training_data_len, :]

# Build sliding windows: 60 consecutive values as input, the following value as target
X_train = []
y_train = []

for i in range(60, len(train_data)):
    X_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i, 0])

X_train, y_train = np.array(X_train), np.array(y_train)

# The LSTM is declared with input_shape=(60, 1), so the windows need a trailing feature
# axis: (samples, 60) -> (samples, 60, 1). The test windows below get the same reshape.
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Build the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, batch_size=1, epochs=1)

# Test data: start 60 rows before the split so the first test target has a full window
test_data = scaled_data[training_data_len - 60: , :]

X_test = []
y_test = dataset[training_data_len:, :]
for i in range(60, len(test_data)):
    X_test.append(test_data[i-60:i, 0])

# Convert the test windows to a numpy array
X_test = np.array(X_test)

# Add the trailing feature axis
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Predict the test period
predicted_price = model.predict(X_test)

# Undo the scaling to get prices again
predicted_price = scaler.inverse_transform(predicted_price)

# Compute the RMSE
rmse = np.sqrt(np.mean(((predicted_price - y_test) ** 2)))
print(rmse)

# Plot the predictions
train = data[:training_data_len]
# Explicit copy: adding a column to a slice of `data` triggers SettingWithCopyWarning
valid = data[training_data_len:].copy()
valid['Predictions'] = predicted_price

plt.figure(figsize=(16,8))
plt.title('LSTM-Modell')
plt.xlabel('Datum', fontsize=18)
plt.ylabel('Schlusskurs', fontsize=18)
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])
plt.legend(['Trainingsdaten', 'Testdaten', 'Vorhersagen'], loc='lower right')
plt.show()

### Z.5 Modellierungshypothesen und Modellierungsannahmen

Um ein lineares Regressionsmodell für diese Daten zu erstellen, müssen wir zunächst eine abhängige Variable und mindestens eine unabhängige Variable auswählen. Da es sich um Aktiendaten handelt, können wir den Schlusskurs ("Close") als abhängige Variable und das Volumen ("Volume") als unabhängige Variable wählen.

Wir können das Modell in Python mit der Bibliothek "statsmodels" erstellen. Hier ist der Code:

In [None]:
import statsmodels.api as sm
# Select DB content
# The `conn` handle left over from the earlier database cells has already been closed,
# so a fresh connection is opened here and closed again once the rows are fetched.
conn = psycopg2.connect("host=db dbname=postgres user=admin password=secret")
try:
    cur = conn.cursor()
    try:
        cur.execute("""SELECT * FROM appl_prices;""")

        # Get column names
        columns = [desc[0] for desc in cur.description]

        # Fetch all rows
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

# Create DataFrame
df = pd.DataFrame(rows, columns=columns)

X = df["Volume"]
y = df["Close"]

# Add the intercept column explicitly
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

Das Modell sieht folgendermaßen aus:

Close = β0 + β1 * Volume + ε

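Die Konstante β0 wird im Code explizit mit `sm.add_constant(X)` hinzugefügt, da `sm.OLS` von sich aus keinen Achsenabschnitt schätzt. β1 ist der Koeffizient für das Volumen; er gibt an, um wie viel sich der Schlusskurs im Mittel ändert, wenn das Volumen um eine Einheit steigt. ε ist der Fehlerterm.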

Die Ausgabe des Modells sieht folgendermaßen aus:

In [None]:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Close   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     71.91
Date:                Fri, 06 May 2023   Prob (F-statistic):           2.98e-17
Time:                        [insert time]   Log-Likelihood:                -9592.2
No. Observations:                3042   AIC:                         1.919e+04
Df Residuals:                    3040   BIC:                         1.921e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           98.1182      2.079     47.211      0.000      94.047     102.190
Volume        1.305e-07   1.54e-08      8.478      0.000       1e-07    1.61e-07
==============================================================================
Omnibus:                     1278.244   Durbin-Watson:                   0.096
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            10287.315
Skew:                          -1.819   Prob(JB):                         0.00
Kurtosis:                      11.162   Cond. No.                     2.08e+09
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.22e+09. This might indicate that there are
strong multicollinearity or other numerical problems.

Die Werte für R-squared und Adjusted R-squared zeigen, dass das Modell nur eine geringe Erklärungskraft hat: Nur etwa 2,3 % der Varianz im Schlusskurs werden durch das Volumen erklärt. Der p-Wert für den Koeffizienten des Volumens ist jedoch statistisch signifikant, was auf einen Zusammenhang zwischen Volumen und Schlusskurs hindeutet (nicht notwendigerweise auf einen kausalen Einfluss).

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# `df` is taken from the preceding cells (loading from 'appl_prices.csv' is disabled).

# Chronological split: first 80 % of the rows for training, the rest for testing
train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

# Predict the close from open, high, low and volume
features = ['Open', 'High', 'Low', 'Volume']
target = 'Close'
model = LinearRegression()
model.fit(train[features], train[target])

# Predictions for the held-out rows
predictions = model.predict(test[features])

# Mean squared error on the held-out rows
squared_errors = (predictions - test[target]) ** 2
mse = squared_errors.mean()
print(f'MSE: {mse:.2f}')

#### Regressionsdiagramm

In [None]:
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import seaborn as sns

# Convert the Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Numeric representation of the dates (matplotlib date numbers). regplot needs a
# numeric x; date2num accepts the whole column, no per-row apply required.
df['num_date'] = date2num(df['Date'])

# Scatter plot with fitted regression line
sns.regplot(x='num_date', y='Close', data=df)

# Labels and title
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.title('Linear Regression of AAPL Stock Prices')

# One tick every 150 rows. Positions are given in the same numeric units as the
# plotted x values, labels are the formatted dates.
tick_dates = df['Date'].iloc[::150]
xticks = date2num(tick_dates)
xticklabels = tick_dates.dt.strftime('%Y-%m-%d').tolist()
plt.xticks(xticks, xticklabels, rotation=45)

# Show the plot
plt.show()

#### Vorhersage

In [None]:
import matplotlib.pyplot as plt

# `test` was sliced from `df` before the Date column was converted, so its dates may
# still be text. Parse them to get a real time axis instead of a categorical one
# (a no-op if they are already datetimes).
test_dates = pd.to_datetime(test['Date'])

# Scatter plot of the actual closing prices
plt.scatter(test_dates, test['Close'], color='gray')

# Predicted closing prices as a line
plt.plot(test_dates, predictions, color='red', linewidth=2)

# Axis labels and title
plt.xlabel('Date')
plt.ylabel('Close')
plt.title('Predictions of AAPL Stock Prices')

# One x tick every 120 rows
xticks = test_dates.iloc[::120]
plt.xticks(xticks)

# Show the figure
plt.show()

### Z.6 NLP

#### 1. Obtain Data

In [None]:
import pandas as pd
# Load the headline/sentiment file written by the scraping section and preview the first rows.
df = pd.read_csv("APPL News.csv")
df.head()

#### 2. Exploratory Data Analysis (EDA)

In [None]:
df.info()
# Class balance of the categorical label. "Sentiment" holds the continuous polarity
# score, for which a frequency table is not informative.
df["Sentiment Label"].value_counts()

#### 3. Data Preparation

In [None]:
# Binary target: "positive" vs. everything else. The label file is written with the
# values 'negativ' / 'neutral' / 'positiv', so the comparison has to use 'positiv';
# comparing against "Positive" never matched and put every row into "notpositive".
df["label"] = df["Sentiment Label"].apply(
    lambda sentiment_label: "positive" if sentiment_label == "positiv" else "notpositive"
)
df = df[["Title", "label"]]
df.head()

#### 4. Model fitting

In [None]:
from sklearn.model_selection import train_test_split

# Texts and their labels, then a seeded 90/10 train/test split
X, y = df["Title"], df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=17,
)

#### 5. Model Evaluation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a multinomial naive Bayes classifier
pipeline = Pipeline([("vectoriser", TfidfVectorizer()), ("model", MultinomialNB())])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split, which was created but never used before
print(f"Accuracy on test set: {pipeline.score(X_test, y_test):.4f}")

#### 6. Model Application

In [None]:
# Classify a single new headline with the fitted pipeline
sample_headlines = ["The new Iphone has an error"]
prediction = pipeline.predict(sample_headlines)
print(prediction)