In [None]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import libraries
import time
import pandas as pd
import visuals as vs
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Pretty display for notebooks: IPython line magic that renders matplotlib
# figures inline in the cell output (only valid inside an IPython/Jupyter kernel).
%matplotlib inline

In [None]:
# Load the stock dataset into a DataFrame.
# The path is relative, so the CSV must sit in the kernel's working directory.
# Later cells read the columns 'stock_symbol', 'stock_name', 'stock_industry',
# 'stock_market_cap', 'stock_risk_level' and 'opinion' from this frame.
data = pd.read_csv("stockDataset_v2.csv")

In [None]:
# Peek at the last five records of the loaded dataset
display(data.tail(5))

In [None]:
# Display data features: prints each column with its non-null count and dtype,
# which is a quick way to spot missing values before modelling.
data.info()

In [None]:
# Features : [stock_industry, stock_market_cap, stock_risk_level]
# Target: "opinion"

In [None]:
# Size of the dataset
n_total = data.shape[0]

# Tally each opinion label with a boolean mask (int() keeps plain Python ints)
n_bad = int(data['opinion'].eq('bad').sum())
n_good = int(data['opinion'].eq('good').sum())

# Share of 'good' opinions, expressed as a percentage of all records
p_good = 100 * n_good / n_total

# Report the class balance
print("Total number of records                        = {}".format(n_total))
print("Stocks with bad opinion to buy                 = {}".format(n_bad))
print("Stocks with good opinion to buy                = {}".format(n_good))
print("Percentage of stocks with good opinion         = {:.2f}%".format(p_good))

In [None]:
# Describe the stock to be scored as a single-row frame.
# It has no 'opinion' value: that is what the model is asked to predict.
inputs = pd.DataFrame([{
    'stock_symbol': "testSymbol",
    'stock_name': "testName",
    'stock_industry': "Consumer Staples",
    'stock_market_cap': "Midcap",
    'stock_risk_level': "Low Risk",
}])

# Append the record after the existing rows so it becomes the last row of `data`.
# Caution: re-running this cell appends the record again.
frames = [data, inputs]
data = pd.concat(frames)

In [None]:
# Renumber the rows 0..n-1 and discard the old labels, so the appended record
# no longer shares an index label with an existing row
data = data.reset_index(drop=True)

In [None]:
# Target: the 'opinion' column
opinion = data['opinion']

# Predictors: every remaining column except the symbol and name identifiers
feature = data.drop(columns=['stock_symbol', 'stock_name', 'opinion'])

In [None]:
# Expand every categorical predictor into indicator (one-hot) columns
feature = pd.get_dummies(feature)

# Encode the 'opinion' target numerically: 1 for 'good', 0 for anything else
# (including the missing opinion of the appended input record)
opinion = opinion.eq('good').astype('int64')

# Report how many columns the one-hot encoding produced
encoded_features = feature.columns.tolist()
print("Total train features after one-hot encoding = {}".format(len(encoded_features)))

In [None]:
# Show every one-hot encoded column name, one per line
for column_name in encoded_features:
    print(column_name)

In [None]:
# Peek at the last five rows of the one-hot encoded feature matrix
display(feature.tail(5))

In [None]:
# Data splitting: Train data and Test data
from sklearn.model_selection import train_test_split

# Hold out exactly ONE row for prediction and train on all the others.
# An integer test_size is an absolute number of test samples (a float would be
# a fraction of the dataset, whose row count changes as the dataset grows).
# shuffle=False preserves row order, so the held-out row is the last row of
# `feature` -- the input record that was appended to the dataset.
# [X - features; Y - target variable]
X_train, X_test, Y_train, Y_test = train_test_split(feature, opinion, test_size=1, shuffle=False)

# Split details
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

In [None]:
# Create model: k-nearest-neighbours classifier that decides each prediction
# from the 3 closest training samples (n_neighbors=3)
predSuggestion = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train model: fit the classifier on the training features (X_train) and
# their numerically encoded opinion labels (Y_train)
predSuggestion.fit(X_train, Y_train)

In [None]:
# Predictions for test data
# Labels follow the encoding applied to the 'opinion' target: 1 = 'good', 0 = anything else.
predicted= predSuggestion.predict(X_test) # one encoded opinion per test row
# Show the prediction for the first test row.
# NOTE(review): presumably this is the appended input record, since the split is
# unshuffled and that record is the last row -- confirm the test set holds one row.
print(predicted[0])