<a href="https://colab.research.google.com/github/tingleica/FundingAnalysis/blob/main/MBSModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the data
df = pd.read_excel("/content/finra_data.xlsx")

# One-hot encode the categorical columns 'Type', 'Issuer', and 'Coupon'
df = pd.get_dummies(df, columns=['Type', 'Issuer','Coupon'])

# Split the data into training and test sets
X = df.drop(['WAVGPrice', 'Date'], axis=1)
y = df['WAVGPrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Create the XGBoost regressor
reg = xgb.XGBRegressor()

# Train the model
reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = reg.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print('R-squared:', r2)


MSE: 1.5918756161872298
R-squared: 0.8788751715353479


Using Cross Validation

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the data
df = pd.read_excel("/content/finra_data.xlsx")

# One-hot encode the categorical columns 'Type','Issuer' and 'Coupon'
df = pd.get_dummies(df, columns=['Type', 'Issuer','Coupon'])

X = df.drop(['WAVGPrice', 'Date'], axis=1)
y = df['WAVGPrice']

# Create the XGBoost regressor
reg = xgb.XGBRegressor()

# Perform cross-validation
scores = cross_val_score(reg, X, y, cv=10, scoring='neg_mean_squared_error', verbose=3)  # 5-fold cross-validation

# The 'scoring' parameter is set to 'neg_mean_squared_error' to get the negative MSE for each fold

# Calculate the mean squared error and R-squared from the cross-validation scores
mse_mean = -scores.mean()
r2_mean = cross_val_score(reg, X, y, cv=5, scoring='r2').mean()

print('MSE (Cross-Validated):', mse_mean)
print('R-squared (Cross-Validated):', r2_mean)

[CV] END ............................... score: (test=-2.326) total time=   0.1s
[CV] END ............................... score: (test=-2.539) total time=   0.1s
[CV] END ............................... score: (test=-2.122) total time=   0.1s
[CV] END ............................... score: (test=-1.539) total time=   0.1s
[CV] END ............................... score: (test=-1.677) total time=   0.1s
[CV] END ............................... score: (test=-2.032) total time=   0.1s
[CV] END ............................... score: (test=-2.032) total time=   0.1s
[CV] END ............................... score: (test=-1.500) total time=   0.1s
[CV] END ............................... score: (test=-1.594) total time=   0.1s
[CV] END ............................... score: (test=-2.689) total time=   0.1s
MSE (Cross-Validated): 2.0049248197376652
R-squared (Cross-Validated): 0.8602965250282624
