# Setup and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm
import seaborn as sns

# Let pandas render up to 300 rows and 30 columns before truncating displayed frames.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 30

In [None]:
# Data source: https://www.kaggle.com/datasets/kabhishm/global-spotify-weekly-chart?resource=download
# The CSV is read via a relative path, so it must sit in the notebook's working directory.
df = pd.read_csv("spotify_weekly_chart.csv")

# Data Inspection

In [None]:
# preview the first 10 rows to see what the columns and values look like
df.head(10)

In [None]:
# get dimensions as a (number of rows, number of columns) tuple
df.shape

In [None]:
# total number of cells (rows * columns) -- note this is not the number of rows
df.size

In [None]:
# summary statistics (count, mean, std, min, quartiles, max);
# by default describe() only reports on the numeric columns of a mixed-type frame
df.describe()

In [None]:
# per-column summary: dtype and non-null count, plus overall memory usage
df.info()

In [None]:
# count the missing (NaN) values in each column
df.isna().sum()

In [None]:
# Wherever '(x?)' is missing, write the placeholder '(x0)' so the column has no NaNs left.
df.loc[df['(x?)'].isna(), '(x?)'] = '(x0)'

In [None]:
# Show the row labels first and the column labels on the following line.
print(df.index, df.columns, sep="\n")

# Common Functions

## Indexing

In [None]:
# look at the first five rows again before changing the index
df.head()

In [None]:
# Use the chart position as the row index so rows can be looked up by position with .loc.
# The guard keeps the cell re-runnable: after the first run 'Pos' is the index rather than
# a column, and calling set_index('Pos') a second time would raise a KeyError.
if df.index.name != 'Pos':
    df = df.set_index('Pos')

In [None]:
# inspect the row labels of the frame
df.index

## Element Access

In [None]:
# get a specific column by its label; the output is a Series
df["Title"]

In [None]:
# get several specific columns by passing a *list* of labels; the output is a DataFrame
df[['Artist', 'Title', 'Streams']]

# df['Artist', 'Title']          # without the inner list this is an error: pandas looks for one column named with the tuple ('Artist', 'Title')

In [None]:
# find all songs that peaked in the top 3, i.e. whose peak position ('Pk') is 1, 2 or 3;
# the comparison has to be inclusive (<=), otherwise songs that peaked at exactly #3 are dropped
df[df['Pk'] <= 3]

In [None]:
# access a single value by index label (150) and column label ('Title')
df.loc[150, 'Title']

In [None]:
# access an entire data row by its index label
df.loc[150]

In [None]:
# access an entire column by its label; the ':' slice selects every row
df.loc[:, 'Title']

In [None]:
# same idea, but by integer position instead of label:
# the row at position 100 and the column at position 2 (both counted from 0)
df.iloc[100, 2]

## Sorting and Value Counts

In [None]:
# sort rows by the most weeks on the charts; ties are broken by the best (numerically lowest) peak.
# 'Wks' is sorted descending and 'Pk' ascending: a single ascending=False would apply to both
# keys and put the song with the *worse* peak first among songs with equal weeks.
df.sort_values(by=["Wks", "Pk"], axis=0, ascending=[False, True])

In [None]:
# number of rows per artist; value_counts() orders the result from most to least frequent
df['Artist'].value_counts()

## Aggregations

Don't compute these statistics "manually" using for loops.
The library functions are vectorized and will run faster.

In [None]:
# arithmetic mean of the 'Streams' column
df['Streams'].mean()

In [None]:
# median (middle value) of the 'Streams' column
df['Streams'].median()

In [None]:
# largest value in the 'Streams' column
df['Streams'].max()

In [None]:
# standard deviation; pandas computes the sample version (ddof=1) by default
df['Streams'].std()

# Examples


Task: Find the artists with the most appearances on the list, and sort the dataset in decreasing order of each artist's number of appearances.

In [None]:
# artist names ordered from most to fewest appearances
# (value_counts() sorts by count in descending order; .index keeps just the names)
most_app_idx = df['Artist'].value_counts().index

In [None]:
# boolean list with one entry per row: False everywhere except the last entry, which is True
# NOTE(review): `l` is not referenced by any other cell visible here -- confirm whether this cell is still needed
l = [False] * (df.shape[0] - 1) + [True]

In [None]:
# Re-key the rows by artist, then pull the artists out in order of appearance count,
# so the rows of the most frequent artists come first.
(
    df.reset_index()
      .set_index('Artist')
      .loc[most_app_idx]
)

Task: Convert the (x?) column into integers

In [None]:
# check each column's dtype before converting
df.dtypes

In [None]:
# Keep only the digits of each '(xN)' string, e.g. '(x12)' -> '12'.
# The vectorized .str accessor replaces a Python-level lambda, and matching the digits
# (instead of slicing off fixed character positions) keeps the cell safe to re-run:
# slicing an already-cleaned '12' a second time would silently corrupt it.
# astype(str) lets the cell also run after the column has been converted to integers.
df['(x?)'] = df['(x?)'].astype(str).str.extract(r'(\d+)', expand=False)

In [None]:
# convert the '(x?)' column to 64-bit integers; this raises if any value is not a valid integer
df['(x?)'] = df['(x?)'].astype('int64')

In [None]:
# confirm the dtype of '(x?)' after the conversion
df.dtypes

In [None]:
# look at the first rows to check the converted '(x?)' values
df.head()

# Visualization

In [None]:
# Scatter the reciprocal of the stream count against the chart position (the frame's index).
fig, ax = plt.subplots(figsize=(16, 8))

ax.scatter(df.index, 1 / df['Streams'])

ax.set_title("Inverse of # Streams vs. Position")
ax.set_xlabel("Position")
ax.set_ylabel("Inverse of # Streams")

In [None]:
# Histogram of the 'Wks' column, split into 20 bins.
fig, ax = plt.subplots(figsize=(16, 8))

ax.hist(df["Wks"], bins=20)

ax.set_title("Distribution of # of Weeks on Charts")
ax.set_xlabel("# of Weeks")
ax.set_ylabel("Count")

In [None]:
# Data source: https://www.kaggle.com/datasets/achintyatripathi/yahoo-finance-apple-inc-aapl?select=AAPL_daily_update.csv
# The CSV is read via a relative path, so it must sit in the notebook's working directory.
aapl_df = pd.read_csv("AAPL_daily_update.csv")

In [None]:
# preview the first rows of the AAPL data
aapl_df.head()

In [None]:
# Line plot of the AAPL price, using the mean of the Open and Close columns for each row.
fig, ax = plt.subplots(figsize=(16, 8))

ax.plot(aapl_df.Date, (aapl_df.Open + aapl_df.Close) / 2, color="green", alpha=0.5)

ax.set_title("AAPL Price over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Average Price (avg of open and close)")

# rotate the x tick labels so they are vertical
plt.xticks(rotation=90)

# keep only every 10th x tick label visible so the dates do not overlap
every_nth = 10
for n, label in enumerate(ax.xaxis.get_ticklabels()):
    label.set_visible(n % every_nth == 0)



# Modeling

In [None]:
# remind ourselves which columns are available before modeling
df.head()

Hypothesis: the number of weeks (Wks) spent on the charts is positively correlated with total streams (Total).

In [None]:
# Scatter weeks on the charts against total streams to eyeball the hypothesised relationship.
# A title and axis labels are set so the figure can be read on its own.
plt.figure(figsize=(16,8))

plt.scatter(df.Wks, df.Total)

plt.title("Total Streams vs. # of Weeks on Charts")
plt.xlabel("# of Weeks")
plt.ylabel("Total Streams")

In [None]:
# restrict the regression data to rows whose 'Wks' value is greater than 10
regression_df = df.loc[df["Wks"] > 10]

In [None]:
# how many rows (and columns) are left after the filter
regression_df.shape

In [None]:
# Ordinary least squares of Total on Wks; statsmodels takes the response first: sm.OLS(y, X).
# No constant column is added to X, so the fitted model has no intercept term.
model = sm.OLS(endog=regression_df["Total"], exog=regression_df["Wks"])
result = model.fit()

result.summary()

In [None]:
# unfortunately seaborn's fitted line includes an intercept, and there is no easy way to change that
fig, ax = plt.subplots(figsize=(16, 8))
sns.regplot(x="Wks", y="Total", data=regression_df, fit_reg=True, ax=ax)

# Exercises

In [None]:
# Load the iris dataset from scikit-learn and turn it into a single DataFrame
# holding the four feature columns plus a "target" column.
from sklearn import datasets

iris = datasets.load_iris()
print(iris["data"][0:10, :], iris["target"], iris["feature_names"])

# Turn the 1-D target vector into a single column so it can be appended to the feature matrix.
# The array's own reshape(-1, 1) is used because np.reshape's `newshape=` keyword is
# deprecated in NumPy 2.1 and later; -1 lets NumPy infer the number of rows.
targets = iris["target"].reshape(-1, 1)
new_data = np.concatenate((iris["data"], targets), axis=1)

# from here on `iris` is the DataFrame, not the object returned by load_iris()
iris = pd.DataFrame(data=new_data, columns=iris["feature_names"] + ["target"])
print(iris)

In [None]:
# Exercise 1:
# Use some one line functions to answer these questions:
# How big is the dataset?
# What information is in the dataset?
# How do we extract columns from the dataset?
# Output the first few entries of the dataset. Print the last few entries of the dataset.


In [None]:
# Exercise 2:
# Output 5 flowers with the largest sepal width (cm)

In [None]:
# Exercise 3:
# Create a new column named sepal ratio: which is the ratio sepal width / sepal length
# Do the same for petal ratio

In [None]:
# Exercise 4:
# Graph petal ratio vs sepal ratio