In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): the blanket 'ignore' filter also hides deprecation notices
# raised by the libraries used below — consider narrowing it by category.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Libraries and modules required
import numpy as np
import pandas as pd
from time import time

In [None]:
# Libraries to visualize the data
import visuals as vs
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Read the stock dataset from its CSV file into a DataFrame
dataset_path = "stockDataset_v2.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Preview the first five records (head() returns five rows by default)
display(data.head())

In [None]:
# Summarise the columns of the loaded dataset
data.info()

In [None]:
# Features : [stock_industry, stock_market_cap, stock_risk_level]
# Target: "opinion"

In [None]:
# Class balance of the target column 'opinion'
opinion_column = data['opinion']

n_total = len(data)                                # all records
n_bad = int((opinion_column == 'bad').sum())       # records labelled 'bad'
n_good = int((opinion_column == 'good').sum())     # records labelled 'good'

# Share of records labelled 'good', expressed as a percentage
p_good = 100 * n_good / n_total

# Report the summary, one line per statistic
for template, value in (
    ("Total number of records                        = {}", n_total),
    ("Stocks with bad opinion to buy                 = {}", n_bad),
    ("Stocks with good opinion to buy                = {}", n_good),
    ("Percentage of stocks with good opinion         = {:.2f}%", p_good),
):
    print(template.format(value))

<h1><u>Data Preprocessing</u></h1>

In [None]:
# Inspect the dataset's column summary again before preprocessing
data.info()

In [None]:
# Separate the target column from the predictor columns.
# The identifier columns and the target itself are not used as features.
non_feature_columns = ['stock_symbol', 'stock_name', 'opinion']
raw_opinion = data['opinion']
raw_features = data.drop(columns=non_feature_columns)

In [None]:
# One-hot encode the raw feature columns
features = pd.get_dummies(raw_features)

# Map the 'raw_opinion' labels to integers: 'good' -> 1, anything else -> 0
opinion = raw_opinion.eq('good').astype('int64')

# Report how many feature columns exist after one-hot encoding
encoded_features = features.columns.tolist()
print("Total features after one-hot encoding = {}".format(len(encoded_features)))

In [None]:
# Print the name of every encoded feature column, one per line
for column_name in encoded_features:
    print(column_name)

In [None]:
# Preview the first five encoded feature rows (head() returns five rows by default)
display(features.head())

In [None]:
# Hold out part of the data for testing
from sklearn.model_selection import train_test_split

# 80% train / 20% test with a fixed seed
# (X_* hold the features, Y_* hold the target variable)
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    opinion,
    test_size=0.20,
    random_state=0,
)

# Report the size of each partition
print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

In [None]:
# Baseline: performance of a naive predictor that labels every record as positive
TP = opinion.sum()           # every actual positive is predicted positive
FP = opinion.count() - TP    # every actual negative is also predicted positive
# The naive case never predicts negative, so both negative counts are zero
TN = 0
FN = 0

# Accuracy, recall and precision from the four counts
total_predictions = TP + FP + TN + FN
accuracy = (TP + TN) / total_predictions
recall = TP / (TP + FN)
precision = TP / (TP + FP)

# F-beta score with beta = 0.5
l_rate = 0.5
beta_sq = l_rate**2
fscore = (1 + beta_sq) * ((precision * recall) / (beta_sq * precision + recall))

# Report the baseline metrics
print(
    "-------------------------\n"
    "Naive Predictor: \n"
    "-------------------------\n"
    "Recall          =  {:.3f}\n"
    "Precision       =  {:.3f}\n"
    "Accuracy score  =  {:.3f}\n"
    "F-score         =  {:.3f}\n"
    "-------------------------".format(recall, precision, accuracy, fscore)
)

In [None]:
# Importing fbeta score and accuracy score from sklearn
from sklearn.metrics import fbeta_score, accuracy_score

def train_predict(learner, sample_size, X_train, Y_train, X_test, Y_test):
    '''
    Fit a learner on the first `sample_size` training rows, then time and
    score its predictions on the test set and on the first 300 training rows.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
                  (must provide fit(X, y) and predict(X))
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - Y_train: target (opinion) training set
       - X_test: features testing set
       - Y_test: target (opinion) testing set

    returns:
       dict with the keys
       - 'train_time': seconds spent in fit
       - 'pred_time': seconds spent in both predict calls
       - 'acc_train' / 'acc_test': accuracy on the training slice / test set
       - 'f_train' / 'f_test': F-beta score (beta = 0.5) on the same data
    '''

    # Number of leading training rows used for the train-side scores
    n_train_eval = 300

    results = {}

    # Fit the learner to the training data with 'sample_size'
    start = time()
    learner = learner.fit(X_train[:sample_size], Y_train[:sample_size])
    end = time()

    # Calculate the training time
    results['train_time'] = end - start

    # Get the predictions on the test set, then on the first training rows
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:n_train_eval])
    end = time()

    # Calculate the total prediction time
    results['pred_time'] = end - start

    # Accuracy on the training slice and on the test set
    results['acc_train'] = accuracy_score(Y_train[:n_train_eval], predictions_train)
    results['acc_test'] = accuracy_score(Y_test, predictions_test)

    # F-beta score on the same data. `beta` has to be passed by keyword:
    # it is keyword-only in scikit-learn >= 1.0, where a positional 0.5 raises TypeError.
    results['f_train'] = fbeta_score(Y_train[:n_train_eval], predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(Y_test, predictions_test, beta=0.5)

    # Progress message for this batch size
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results

In [None]:
# k-nearest-neighbours classifier with 7 neighbours, bound to `learner`
from sklearn.neighbors import KNeighborsClassifier as KNN
learner = KNN(n_neighbors=7)

In [None]:
# Three training-set sizes: 100%, 10% and 1% of the training rows
n_train_rows = len(Y_train)
samples_100 = n_train_rows
samples_10 = n_train_rows // 10
samples_1 = n_train_rows // 100
# NOTE(review): on a small training set the 1% batch may hold fewer rows than the
# 7 neighbours requested for the KNN learner — confirm the dataset is large enough

In [None]:
# Debug check: show the container type of the test features
print(type(X_test))

In [None]:
# One-off run of a fresh 7-neighbour KNN on the full training set
# NOTE(review): `result` is not read by any of the cells visible here
result = train_predict(KNN(n_neighbors=7), len(Y_train), X_train, Y_train, X_test, Y_test)

In [None]:
# Train and score each learner on every batch size, keyed by class name and batch index
results = {}
batch_sizes = (samples_1, samples_10, samples_100)
for learner in [learner]:
    learner_name = learner.__class__.__name__
    per_batch = results[learner_name] = {}
    for i, samples in enumerate(batch_sizes):
        per_batch[i] = train_predict(learner, samples, X_train, Y_train, X_test, Y_test)

In [None]:
# Plot the metrics collected in `results` for the evaluated learner(s)
# NOTE(review): `accuracy` and `fscore` are the naive-predictor values computed earlier;
# presumably vs.evaluate draws them as reference baselines — confirm in visuals.py
vs.evaluate(results, accuracy, fscore)

In [None]:
# Show the collected metrics of each learner as a table, one column per batch size
separator = "--------------------------------------------------"
batch_labels = {0:'1%', 1:'10%', 2:'100%'}

print(separator)
for learner_label, per_batch in results.items():
    print(learner_label)
    display(pd.DataFrame(per_batch).rename(columns=batch_labels))
    print(separator)

In [None]:
# Print the confusion matrix and the accuracy of each fitted classifier on the test set
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print("--------------------------------------------------")
for model in [learner]:
    # Predict once and reuse the predictions for both metrics
    test_predictions = model.predict(X_test)
    # Stored under its own name so the `results` dict built earlier in the
    # notebook is not overwritten and the cells that read it can be re-run
    conf_matrix = confusion_matrix(Y_test, test_predictions)
    print('Confusion matrix for model: {}'.format(model.__class__.__name__))
    print(conf_matrix)
    print('Accuracy Score :', accuracy_score(Y_test, test_predictions))
    print("--------------------------------------------------")