In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Step 1:  Read in the `tsx-energy-2018.csv` file and create a DataFrame.

In [3]:
# Load the TSX energy CSV into a Pandas DataFrame.
# No index column is specified, so "Ticker" stays a regular column
# (default integer index); a later cell reads it as df_stocks['Ticker'].
csv_path = Path("../Resources/tsx-energy-2018.csv")
df_stocks = pd.read_csv(csv_path)

# Preview the first rows of the DataFrame
df_stocks.head()



Unnamed: 0,Ticker,CompanyName,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,EnergyType
0,ARX,ARC Resources Ltd.,13.14,13.34,12.91,13.1,1479913.38,-0.7275,0.359,Oil
1,CCO,Cameco Corporation,13.7,13.92,13.5,13.7,1203788.22,0.2014,0.3693,Other Energy
2,CNQ,Canadian Natural Resources Limited,41.97,42.46,41.46,41.91,3218248.68,-0.3461,0.2947,Oil
3,CVE,Cenovus Energy Inc.,11.96,12.18,11.75,11.95,4566143.56,-0.3219,0.45,Oil
4,CPG,Crescent Point Energy Corp.,8.53,8.67,8.36,8.5,3919414.03,-1.0103,0.4597,Other Energy


In [4]:
# Check the DataFrame dimensions as a (rows, columns) tuple
df_stocks.shape

(24, 10)

### Step 2: Scale the `df_stocks` DataFrame and create a new DataFrame that contains the scaled data. 

In [5]:
# Standardize the numeric price, volume, return, and variance columns
feature_columns = [
    'MeanOpen',
    'MeanHigh',
    'MeanLow',
    'MeanClose',
    'MeanVolume',
    'AnnualReturn',
    'AnnualVariance',
]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_stocks[feature_columns])



In [6]:
# Wrap the scaled array in a DataFrame, labelling each row with its ticker.
# The tickers come from the original data (same row order as scaled_data)
# and are used directly as the index, named "Ticker".
df_stocks_scaled = pd.DataFrame(
    scaled_data,
    columns=[
        'MeanOpen',
        'MeanHigh',
        'MeanLow',
        'MeanClose',
        'MeanVolume',
        'AnnualReturn',
        'AnnualVariance',
    ],
    index=pd.Index(df_stocks['Ticker'].values, name='Ticker'),
)

# Display sample data
df_stocks_scaled.head()


Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424


In [10]:
# Encode the "EnergyType" column as a binary variable to categorize
# oil versus non-oil firms: 1 where EnergyType is "Oil", 0 otherwise.
encoded_df = df_stocks_scaled.copy()

# "Ticker" is the index of df_stocks_scaled (not a column), and a ticker
# symbol does not say what kind of energy a firm produces, so the flag is
# derived from the original "EnergyType" column instead.
# df_stocks_scaled was built row-for-row from df_stocks, so the values are
# assigned by position (to_numpy) rather than aligned on the differing indexes.
encoded_df['EnergyType'] = (df_stocks['EnergyType'] == 'Oil').astype(int).to_numpy()

In [7]:
# Concatenate the "EnergyType" variables with the scaled data DataFrame.


# Display the sample data



Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,Oil,Other Energy
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854,1,0
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411,0,1
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415,1,0
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607,1,0
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424,0,1


### Step 3: Initialize the K-means model with three clusters and then fit the `df_stocks_scaled` DataFrame to the model.

In [8]:
# Initialize the K-Means model with n_clusters=3



In [9]:
# Fit the model for the df_stocks_scaled DataFrame



KMeans(n_clusters=3)

### Step 4. Predict the clusters and then create a new DataFrame with the predicted clusters.

In [10]:
# Predict the model segments (clusters)


# View the stock segments



[1 0 2 1 1 2 1 0 0 2 0 0 1 0 0 0 2 0 1 2 2 2 2 1]


In [11]:
# Create a new column in the DataFrame with the predicted clusters


# Review the DataFrame



Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,AnnualReturn,AnnualVariance,Oil,Other Energy,StockCluster
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ARX,-0.916832,-0.917217,-0.918045,-0.918135,-0.152786,-1.332445,0.460854,1,0,1
CCO,-0.880152,-0.879472,-0.879062,-0.878786,-0.379117,1.695742,0.559411,0,1,0
CNQ,0.971524,0.977848,0.968315,0.971255,1.272074,-0.089092,-0.154415,1,0,2
CVE,-0.994122,-0.992707,-0.994689,-0.993553,2.376902,-0.010201,1.331607,1,0,1
CPG,-1.218785,-1.22113,-1.218673,-1.219807,1.846798,-2.254365,1.424424,0,1,1


### Step 5: Create a scatter plot to visualize the "StockCluster" using "AnnualVariance" as the x-variable and "AnnualReturn" as the y-variable. Be sure to style and format your plot.

In [12]:
# Create a scatter plot with x="AnnualVariance", y="AnnualReturn"



### Step 6: Reduce the number of features to two principal components on the `df_stocks_scaled` DataFrame, and  calculate the explained variance ratio that results from the PCA data.

In [13]:
# Create the PCA model instance where n_components=2



In [14]:
# Fit the df_stocks_scaled data to the PCA


# Review the first five rows of the PCA data
# using bracket notation ([0:5])



array([[-1.91504455,  0.7957083 ],
       [-1.87463613, -1.47756556],
       [ 2.11840035,  1.49590303],
       [-2.11973139,  2.16943585],
       [-2.87954095,  2.45437371]])

In [15]:
# Calculate the explained variance



array([0.60077431, 0.1901621 ])

### Step 7: Use the PCA data calculated in Step 6 to create a new DataFrame called `df_stocks_pca`, then add an additional column to the `df_stocks_pca` DataFrame that contains the tickers from the original `df_stocks` DataFrame.

In [16]:
# Creating a DataFrame with the PCA data


# Copy the tickers names from the original data


# Set the Ticker column as index


# Review the DataFrame



Unnamed: 0_level_0,PC1,PC2
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
ARX,-1.915045,0.795708
CCO,-1.874636,-1.477566
CNQ,2.1184,1.495903
CVE,-2.119731,2.169436
CPG,-2.879541,2.454374


### Step 8: Rerun the K-means algorithm on the `df_stocks_pca` DataFrame and create a scatter plot using the  "StockCluster" and the two principal components for the x- and y-axes. Be sure to style and format your plot.

In [17]:
# Initialize the K-Means model with n_clusters=3


# Fit the model for the df_stocks_pca DataFrame


# Predict the model segments (clusters)


# Print the stock segments



[0 2 1 0 0 1 0 2 2 1 2 2 0 2 2 2 1 2 0 1 1 1 1 0]


In [18]:
# Build df_stocks_pca_predictions: a copy of df_stocks_pca with the predicted
# clusters attached as a new "StockCluster" column (df_stocks_pca is left as is).
# NOTE(review): df_stocks_pca and stock_clusters are expected to come from the
# preceding PCA and K-Means cells -- confirm those cells define them.
df_stocks_pca_predictions = df_stocks_pca.assign(StockCluster=stock_clusters)

# Show the first rows of the result
df_stocks_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,StockCluster
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARX,-1.915045,0.795708,0
CCO,-1.874636,-1.477566,2
CNQ,2.1184,1.495903,1
CVE,-2.119731,2.169436,0
CPG,-2.879541,2.454374,0


In [19]:
# Create the scatter plot with x="PC1" and y="PC2"




**Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

**Answer:** We can conclude that we can use fewer features and get similar performance to the original model, since we can still clearly identify three clusters.

### Bonus

* Use the elbow method to find the best value for `k` using the PCA data. Use a range from 1 to 11.

* Plot a line chart with all the inertia values computed with the different values of k to visually identify the optimal value for `k`.

In [20]:
# Create a list with the number of k-values to try
# Use a range from 1 to 11



In [21]:
# Create an empty list to store the inertia values



In [22]:
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_stocks_pca`
# 3. Append the model.inertia_ to the inertia list



In [23]:
# Create a dictionary with the data to plot the Elbow curve


# Create a DataFrame with the data to plot the Elbow curve



In [24]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.



**Question:** What is the best value for k when using the PCA data? Does it differ from the best k value found using the original data?

**Answer:** Based on this Elbow Curve, it looks like `k=3` is still the correct one.