# Set 1

### 1. Import data from GitHub <br>2. Import into a pandas DataFrame <br>3. Assign column names

In [178]:
import pandas as pd

# Location of the raw car-info CSV on GitHub.
CAR_INFO_URL = 'https://raw.githubusercontent.com/thesabermaniac/PandasAndBasicML/main/Blank_Car_Info.csv'

# Column labels are supplied explicitly through `names` when reading the file.
names = [
    'numDoors',
    'numCylinders',
    'mpgHighway',
    'mpgStreets',
    'currentMileage',
    'avgMileagePerTuneUp',
    'qualityRating',
]
df = pd.read_csv(CAR_INFO_URL, names=names)

print(df)

      numDoors  numCylinders  ...  avgMileagePerTuneUp  qualityRating
0            3             1  ...                  431       0.369963
1            1             2  ...                  934       0.754573
2            1             3  ...                  915       0.886215
3            2             3  ...                   10       0.971624
4            2             2  ...                   10       0.848398
...        ...           ...  ...                  ...            ...
1075         4             1  ...                  283       0.518708
1076         4             2  ...                  431       0.540775
1077         2             2  ...                   10       0.919054
1078         1             1  ...                   10       0.828795
1079         2             3  ...                  934       0.116761

[1080 rows x 7 columns]


### 4. Check for null values

In [179]:
# Boolean frame that is True wherever a value is missing ...
is_null = df.isna()
# ... reduced to a count of missing values per column.
print(is_null.sum(axis=0))

numDoors               0
numCylinders           0
mpgHighway             0
mpgStreets             0
currentMileage         0
avgMileagePerTuneUp    0
qualityRating          0
dtype: int64


### 5. Run k-means clustering

In [180]:
from sklearn.cluster import KMeans

# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster numbering (and therefore any label later returned by `predict`) can
# change from one run to the next. Pin the seed and the number of restarts so
# that Restart & Run All reproduces the same clusters.
# NOTE(review): the features are clustered on their raw scales, so the
# large-valued mileage columns likely dominate the distance -- consider
# standardizing first if that is not intended.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(df.to_numpy())

# One row per cluster, one column per input feature.
cc = kmeans.cluster_centers_
print(cc)

[[2.35379061e+00 2.37184116e+00 2.68772563e+01 2.58032491e+01
  2.52176895e+02 2.45373646e+02 5.06421092e-01]
 [2.41981132e+00 2.42924528e+00 2.62971698e+01 2.72169811e+01
  2.48825472e+02 8.66966981e+02 5.46785990e-01]
 [2.26724138e+00 2.49137931e+00 2.61853448e+01 2.52155172e+01
  8.54737069e+02 2.34806034e+02 4.89035414e-01]
 [2.34146341e+00 2.47560976e+00 2.70731707e+01 2.67682927e+01
  8.75695122e+02 8.72621951e+02 5.06455425e-01]]


### 6. Find the cluster for new data

In [181]:
import numpy as np

# A single new observation: seven values, one per column of the fitted data.
new_data = np.array([2, 4, 500, 300, 150, 760, .6])

# predict() expects a 2-D (n_samples, n_features) input, so the one sample is
# presented as a single row instead of repeating the literal values.
pred = kmeans.predict(new_data.reshape(1, -1))
print(pred)

[1]


### 7. SVM: classify the binned quality rating

In [182]:
from sklearn.svm import SVC

# Bin the continuous rating into three equal-width classes. The labels are kept
# in their own variable rather than written back into `df`, so this cell can be
# re-run safely and the cells that need the numeric rating keep working.
quality_labels = pd.cut(df['qualityRating'], 3, labels=['bad', 'average', 'good'])

# Features are every column except the rating that is being predicted.
car_features = df.drop(columns='qualityRating').to_numpy()

# `svm` is bound to the fitted classifier as before; importing SVC directly
# avoids importing the sklearn `svm` module and then overwriting its name.
svm = SVC().fit(car_features, quality_labels.to_numpy())

# Six feature values, in the same column order as `car_features`.
new_car = svm.predict([[2, 3, 100, 900, 500, 45]])
print(new_car)

['good']


# Set 2

### 1. Download Yahoo data<br>2. Upload to GitHub<br>3. Place data in a pandas DataFrame<br>4. Output statistics for each column

In [183]:
# Location of the ticker price CSV on GitHub.
TICKER_DATA_URL = 'https://raw.githubusercontent.com/thesabermaniac/PandasAndBasicML/main/TickerData.csv'

stock_prices = pd.read_csv(TICKER_DATA_URL)

# Summary statistics (count, mean, std, quartiles, ...) for each numeric column.
price_summary = stock_prices.describe()
print(price_summary)


              TSLA         AAPL         AMZN        GOOGL         NFLX
count  1259.000000  1259.000000  1259.000000  1259.000000  1259.000000
mean    124.596465    54.767843  1631.599848  1132.494996   284.875163
std     168.723593    28.823262   776.167588   287.578670   133.982203
min      35.793999    22.584999   552.080017   681.140015    85.330002
25%      49.028000    36.445002   953.474976   936.875000   153.870003
50%      62.644001    44.915001  1641.089966  1104.510010   301.049988
75%      74.207000    63.637498  1906.724976  1258.955017   367.649994
max     883.090027   143.160004  3531.449951  2118.620117   586.340027


### 5. Output covariance and correlation matrices

In [184]:
# cov()/corr() are only defined for numeric data, and the frame also carries a
# text 'Date' column. Recent pandas versions raise on non-numeric columns
# instead of silently skipping them, so restrict to numeric columns explicitly.
numeric_prices = stock_prices.select_dtypes(include='number')

print(numeric_prices.cov())
print(numeric_prices.corr())

                TSLA          AAPL           AMZN          GOOGL           NFLX
TSLA    28467.650870   4314.330727  100627.026597   38715.315103   15128.697805
AAPL     4314.330727    830.780429   21190.722289    7855.577163    3360.687836
AMZN   100627.026597  21190.722289  602436.125206  211304.009446  100494.451660
GOOGL   38715.315103   7855.577163  211304.009446   82701.491156   35426.739287
NFLX    15128.697805   3360.687836  100494.451660   35426.739287   17951.230657
           TSLA      AAPL      AMZN     GOOGL      NFLX
TSLA   1.000000  0.887145  0.768393  0.797903  0.669235
AAPL   0.887145  1.000000  0.947212  0.947716  0.870238
AMZN   0.768393  0.947212  1.000000  0.946663  0.966361
GOOGL  0.797903  0.947716  0.946663  1.000000  0.919448
NFLX   0.669235  0.870238  0.966361  0.919448  1.000000


### 6. Linear regression

In [185]:
from sklearn.linear_model import LinearRegression

# Regress the TSLA price on its row position (0, 1, 2, ...).
tsla = stock_prices['TSLA']
length = len(tsla)

# Row positions as a single-feature column, the 2-D shape fit() expects.
day_index = np.arange(length).reshape(-1, 1)

model = LinearRegression()
model.fit(day_index, tsla)

# Extrapolate to the five positions that follow the last observed row.
next_days = [[day] for day in range(length, length + 5)]
print(model.predict(next_days))

[306.39006341 306.67862468 306.96718594 307.25574721 307.54430848]


### 7. Log percentage difference (difference of consecutive log prices)

In [186]:
import copy

# Work on an independent copy so the raw prices in `stock_prices` stay untouched.
log_diff = copy.deepcopy(stock_prices)

# Every column from 'TSLA' onward holds prices; 'Date' sits before it and is kept as-is.
price_columns = stock_prices.loc[:, 'TSLA':]
log_prices = price_columns.apply(np.log)

# Row-to-row change in log price; the first row has no predecessor, hence NaN.
log_diff.loc[:, 'TSLA':] = log_prices.diff()
print(log_diff)

           Date      TSLA      AAPL      AMZN     GOOGL      NFLX
0     2/24/2016       NaN       NaN       NaN       NaN       NaN
1     2/25/2016  0.046020  0.006844  0.002002  0.011338  0.031377
2     2/26/2016  0.015407  0.001549  0.000144 -0.005860  0.002747
3     2/29/2016  0.008319 -0.002273 -0.004893 -0.010596 -0.014665
4      3/1/2016 -0.029504  0.038946  0.046882  0.034196  0.051026
...         ...       ...       ...       ...       ...       ...
1254  2/17/2021  0.002421 -0.017802  0.012068  0.003745 -0.010716
1255  2/18/2021 -0.013586 -0.008674  0.005903 -0.006065 -0.005675
1256  2/19/2021 -0.007752  0.001233 -0.023816 -0.008106 -0.014700
1257  2/22/2021 -0.089376 -0.030252 -0.021510 -0.016679 -0.011993
1258  2/23/2021 -0.022161 -0.001112  0.004317  0.002849  0.022910

[1259 rows x 6 columns]
