<a href="https://colab.research.google.com/github/tingleica/FundingAnalysis/blob/main/Mortgage_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import requests as rq
import zipfile as zf
import io
import os

In [None]:
# Labels of the Treasury yield columns, listed from "1M" through "30Y".
# They are reused verbatim as DataFrame column names.
treasury_yield_list = [
    "1M", "3M", "6M",
    "1Y", "2Y", "3Y", "5Y", "7Y",
    "10Y", "20Y", "30Y",
]

In [None]:
# Column layout of one price row: identifying fields first, then one column
# per Treasury yield label, then the price column.
columns_list = ["Date", "Type", "Issuer", "Coupon", *treasury_yield_list, "WAVGPrice"]

In [None]:
# Coupon values, kept as strings (presumably percent rates -- TODO confirm
# against the data source).
coupon_list = [
    "4",
    "4.5",
    "5",
    "5.5",
    "6",
]

In [None]:
def get_date(filename):
    """Pull the first 8-digit run out of ``filename`` and return it as a date string.

    The digits are parsed as ``YYYYMMDD``. The result has the form
    ``"<year>-<month>-<day>"`` where month and day are NOT zero-padded
    (for example ``"2023-1-5"``).

    Raises:
        AttributeError: if ``filename`` contains no run of eight digits.
        ValueError: if the eight digits do not form a valid ``YYYYMMDD`` date.
    """
    import re
    from datetime import datetime

    stamp = re.search(r'\d{8}', filename)
    parsed = datetime.strptime(stamp.group(), '%Y%m%d').date()
    return "-".join(map(str, (parsed.year, parsed.month, parsed.day)))

In [None]:
def format_yield_date(unformatted_date):
    """Re-format the first ``NN-NN-NNNN`` token found in ``unformatted_date``.

    The token is interpreted day-first (``DD-MM-YYYY``). The result has the
    form ``"<year>-<month>-<day>"`` where month and day are NOT zero-padded
    (for example ``"2023-1-5"``).

    Raises:
        AttributeError: if no ``NN-NN-NNNN`` token is present.
        ValueError: if the token is not a valid day-month-year date.
    """
    import re
    from datetime import datetime

    token = re.search(r'\d{2}-\d{2}-\d{4}', unformatted_date)
    parsed = datetime.strptime(token.group(), '%d-%m-%Y').date()
    return "%d-%d-%d" % (parsed.year, parsed.month, parsed.day)

In [None]:
def fetch_coupon_yields_price_df(asofdate, sfh_type, issuer, coupon, treasury_yields, price):
    """Build a one-row DataFrame laid out according to the module-level ``columns_list``.

    The row is ``asofdate, sfh_type, issuer, coupon``, followed by every entry
    of ``treasury_yields``, followed by ``price``.

    ``treasury_yields`` must be a list, because it is joined to the other
    fields with list concatenation. Its length has to match the number of
    yield columns in ``columns_list``, otherwise pandas rejects the row.
    """
    row = [asofdate, sfh_type, issuer, coupon] + treasury_yields + [price]
    return pd.DataFrame([row], columns=columns_list)

In [None]:
def print_model_debug_info(model_info, X_df, y_df, X_train_df, y_train_df, X_test_df, y_test_df, y_train_pred, y_test_pred, r2_score_train, r2_score_test):
    """Print a short evaluation report for a fitted model.

    Only ``model_info`` (a string label) and the two R² values are printed,
    each score with two decimals. The data arguments (``X_df`` through
    ``y_test_pred``) are accepted but currently unused; they are there so
    full dumps of the frames and predictions can be printed when debugging.
    """
    print("Testing " + model_info + " ...")

    # Same two-line report for each split: a header, then the score.
    for header, score in (
        ("Printing r2_score for y_train and y_train_pred ...", r2_score_train),
        ("Printing r2_score for y_test and y_test_pred ...", r2_score_test),
    ):
        print(header)
        print("Coefficient of determination: %.2f" % score)

In [None]:
def scikit_linear_regression(all_df, test_size=0.20, random_state=40):
    """Fit a linear regression for ``WAVGPrice`` and print train/test R² scores.

    Features are the ``Type``, ``Issuer`` and ``Coupon`` columns plus the
    eleven Treasury yield columns (``1M`` through ``30Y``). Feature columns
    with object, string or category dtype are one-hot encoded before the
    data is split.

    Args:
        all_df: DataFrame holding the feature columns above and ``WAVGPrice``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test shuffle.

    Returns:
        None. The scores are reported through ``print_model_debug_info``.
    """
    # Only the sklearn pieces that are actually used are imported here.
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score

    X_df = all_df[["Type", "Issuer", "Coupon", "1M", "3M", "6M", "1Y", "2Y", "3Y", "5Y", "7Y", "10Y", "20Y", "30Y"]]
    Y_df = all_df[["WAVGPrice"]]

    # drop_first=True keeps k-1 indicator columns for a feature with k levels.
    X_df = pd.get_dummies(data=X_df, drop_first=True)

    X_df_train, X_df_test, Y_df_train, Y_df_test = train_test_split(
        X_df, Y_df, test_size=test_size, random_state=random_state
    )

    # Create the linear regression object and train it on the training split.
    regr = linear_model.LinearRegression()
    regr.fit(X_df_train, Y_df_train)

    # Predict on both splits so train and test fit can be compared.
    Y_df_train_pred = regr.predict(X_df_train)
    Y_df_test_pred = regr.predict(X_df_test)

    print_model_debug_info(model_info = "Linear regression",
                           X_df = X_df,
                           y_df = Y_df,
                           X_train_df = X_df_train,
                           y_train_df = Y_df_train,
                           X_test_df = X_df_test,
                           y_test_df = Y_df_test,
                           y_train_pred = Y_df_train_pred,
                           y_test_pred = Y_df_test_pred,
                           r2_score_train = r2_score(Y_df_train, Y_df_train_pred),
                           r2_score_test = r2_score(Y_df_test, Y_df_test_pred))

In [None]:
def scikit_decision_tree_regression(all_df, test_size=0.25, random_state=13):
    """Fit a decision tree regressor for ``WAVGPrice`` and print train/test R² scores.

    Features are the ``Type``, ``Issuer`` and ``Coupon`` columns plus the
    eleven Treasury yield columns (``1M`` through ``30Y``). Feature columns
    with object, string or category dtype are one-hot encoded before the
    data is split.

    Args:
        all_df: DataFrame holding the feature columns above and ``WAVGPrice``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed used for both the train/test shuffle and the tree
            itself, so repeated runs give the same scores.

    Returns:
        None. The scores are reported through ``print_model_debug_info``.
    """
    # Only the sklearn pieces that are actually used are imported here.
    from sklearn import tree
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score

    X_df = all_df[["Type", "Issuer", "Coupon", "1M", "3M", "6M", "1Y", "2Y", "3Y", "5Y", "7Y", "10Y", "20Y", "30Y"]]
    y_df = all_df[["WAVGPrice"]]

    # drop_first=True keeps k-1 indicator columns for a feature with k levels.
    X_df = pd.get_dummies(data=X_df, drop_first=True)

    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X_df, y_df, test_size=test_size, random_state=random_state
    )

    # Create the decision tree regressor. The seed is fixed because the tree
    # permutes features at every split and breaks ties between equally good
    # splits at random; without it the fitted tree can differ run to run.
    regr = tree.DecisionTreeRegressor(random_state=random_state)

    # Train the model using the training set.
    regr.fit(X_train_df, y_train_df)

    # Predict on both splits so train and test fit can be compared.
    y_train_pred_df = regr.predict(X_train_df)
    y_test_pred_df = regr.predict(X_test_df)

    print_model_debug_info(model_info = "Decision Tree regression",
                           X_df = X_df,
                           y_df = y_df,
                           X_train_df = X_train_df,
                           y_train_df = y_train_df,
                           X_test_df = X_test_df,
                           y_test_df = y_test_df,
                           y_train_pred = y_train_pred_df,
                           y_test_pred = y_test_pred_df,
                           r2_score_train = r2_score(y_train_df, y_train_pred_df),
                           r2_score_test = r2_score(y_test_df, y_test_pred_df))

In [None]:
def scikit_random_forest_regression(all_df, test_size=0.2, random_state=1):
    """Fit a random forest regressor for ``WAVGPrice`` and print train/test R² scores.

    ``Type`` and ``Issuer`` are converted to integer category codes (not
    one-hot encoded) and used together with ``Coupon`` and the eleven
    Treasury yield columns (``1M`` through ``30Y``) as features.

    The caller's ``all_df`` is left untouched: the encoded columns are added
    to a derived frame instead of being written back into the argument.

    Args:
        all_df: DataFrame holding ``Type``, ``Issuer``, ``Coupon``, the yield
            columns and ``WAVGPrice``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test shuffle. The forest itself uses
            a fixed seed of 0.

    Returns:
        None. The scores are reported through ``print_model_debug_info``.
    """
    # Only the sklearn pieces that are actually used are imported here.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    # assign() returns a new frame, so neither the dtype conversion nor the
    # two helper columns leak into the DataFrame passed in by the caller.
    encoded_df = all_df.assign(
        Type_numerical=all_df["Type"].astype("category").cat.codes,
        Issuer_numerical=all_df["Issuer"].astype("category").cat.codes,
    )

    X_df = encoded_df[["Type_numerical", "Issuer_numerical", "Coupon", "1M", "3M", "6M", "1Y", "2Y", "3Y", "5Y", "7Y", "10Y", "20Y", "30Y"]]
    # Select the target as a 1-D Series. A single-column DataFrame makes
    # RandomForestRegressor.fit warn that a column vector was passed where a
    # 1-D array was expected.
    y_df = encoded_df["WAVGPrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=test_size, random_state=random_state
    )

    regr = RandomForestRegressor(n_estimators=100, random_state=0)
    regr.fit(X_train, y_train)

    # Predict on both splits so train and test fit can be compared.
    y_train_pred = regr.predict(X_train)
    y_test_pred = regr.predict(X_test)

    print_model_debug_info(model_info = "Random Forest regression",
                           X_df = X_df,
                           y_df = y_df,
                           X_train_df = X_train,
                           y_train_df = y_train,
                           X_test_df = X_test,
                           y_test_df = y_test,
                           y_train_pred = y_train_pred,
                           y_test_pred = y_test_pred,
                           r2_score_train = r2_score(y_train, y_train_pred),
                           r2_score_test = r2_score(y_test, y_test_pred))

In [None]:
#all_df = extract_mbs_data_from_file(test_file, all_treasury_yields)

In [None]:
# Load the dataset from finra_data.xlsx. The active path points at /content/
# (presumably the Colab working directory -- the notebook carries a Colab
# badge); the commented line is the repository-relative alternative.
#all_df = pd.read_excel("datasets//finra_data.xlsx")
all_df = pd.read_excel("/content/finra_data.xlsx")
# Show the size and a short preview rather than dumping every row into the
# cell output.
print(all_df.shape)
print(all_df.head().to_string())

In [None]:
#scikit_linear_regression(all_df)

In [None]:
# Fit the decision tree model on the loaded data and print R² for the train and test splits.
scikit_decision_tree_regression(all_df)

Testing Decision Tree regression ...
Printing r2_score for y_train and y_train_pred ...
Coefficient of determination: 1.00
Printing r2_score for y_test and y_test_pred ...
Coefficient of determination: 0.80


In [None]:
# Fit the random forest model on the loaded data and print R² for the train and test splits.
scikit_random_forest_regression(all_df)

  regr.fit(X_train, y_train)


Testing Random Forest regression ...
Printing r2_score for y_train and y_train_pred ...
Coefficient of determination: 0.98
Printing r2_score for y_test and y_test_pred ...
Coefficient of determination: 0.89
