# Chapter 1

### Matplotlib Theme

```
import matplotlib.pyplot as plt

# Switch every subsequent figure to the "ggplot" style sheet
plt.style.use("ggplot")
# Restore Matplotlib's built-in default style
plt.style.use("default")
# List the style sheet names that can be passed to plt.style.use
print(plt.style.available)
```

### Time Series Graph

```
import pandas as pd

# Load the CSV with "date_col" parsed as datetimes and used as the index
df = pd.read_csv('filename.csv', parse_dates=["date_col"], index_col="date_col")
# Keep only the 1960s: label-based slice on the datetime index
df = df.loc["1960-01-01":"1969-12-31"]

# Partial date strings also work as slice bounds on a datetime index
sub_df = df.loc['1960':'1970']
ax = sub_df.plot(
    color='blue',
    linestyle='--',
    linewidth=3,
    fontsize=12,
    figsize=(12, 5),
)
ax.set_xlabel('X label')
ax.set_ylabel('Y label')
# Reference lines: the vertical one takes a position on the time axis, the horizontal one a y value
ax.axvline('1969-01-01', color='red', linestyle='--')
ax.axhline(4, color='green', linestyle='--')
# Shaded bands: a vertical region between two dates, a horizontal region between two y values
ax.axvspan('1964-01-01', '1968-01-01', color='red', alpha=0.3)
ax.axhspan(8, 6, color='green', alpha=0.3)

# Summary statistics of the plotted data
df_summary = sub_df.describe()
# Attach information table with the plot
ax.table(cellText=df_summary.values,
        # One width per table column; sized from df_summary because describe()
        # may return fewer columns than df (e.g. non-numeric ones are left out)
        colWidths=[0.3]*len(df_summary.columns),
        rowLabels=df_summary.index, # Specify row labels
        colLabels=df_summary.columns, # Specify column labels
        loc='top') # Specify location of the table in the plot

# Moving average over a window of 52 observations, computed for every column
rolling_mean = df.rolling(52).mean()
# Other plot kinds: a histogram here; df.plot.kde(), df.plot.box() etc. work the same way
df.plot.hist(bins=100)

from statsmodels.graphics import tsaplots

# Auto-correlation plot of the series, up to 40 lags
fig = tsaplots.plot_acf(df['col'], lags=40)
# Partial auto-correlation plot of the same series
fig = tsaplots.plot_pacf(df['col'], lags=40)

# Multiple time series on same plot
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(df.index, df['col1'], color='red')
ax.tick_params('y', colors='red') # Colour the y ticks to match the first series
ax.set_xlabel('Time Axis')
ax.set_ylabel('First Y Axis')

ax2 = ax.twinx() # Same x axis, but separate y axis
ax2.plot(df.index, df['col2'], color='blue')
ax2.tick_params('y', colors='blue') # Colour the y ticks to match the second series
ax2.set_ylabel('Second Y Axis')

# xy is the point the arrow points at, xytext is where the label is drawn
ax2.annotate("Annotation text", xy=(pd.Timestamp('2015-10-06'), 1),
        xytext=(pd.Timestamp('2008-10-06'), -0.2),
        arrowprops={"arrowstyle":"->", "color":"gray"})

plt.show()

```

# Chapter 2

### Missing values

```
import matplotlib.pyplot as plt
import missingno as msno

# Per column: is anything missing, and how many values are missing
df.isna().any()
df.isna().sum()

# Draw the missing-data matrix for the frame
msno.matrix(df)
plt.show()

# Drop rows whose value in 'col' is missing.
# With axis=0, subset names the columns to inspect; with axis=1 it would be
# interpreted as row labels, so subset=['col'] only makes sense for rows.
df_dropped = df.dropna(subset=['col'], axis=0)
df.dropna(axis=0) # Drop every row that contains a missing value (default)
df.dropna(axis=1) # Drop every column that contains a missing value

import numpy as np

# Replace/impute missing data with single value
col_mean = df['col'].mean()
df_imputed = df.fillna({'col': col_mean})
df.bfill() # Backward fill; replaces the deprecated fillna(method='bfill')
df.interpolate(method='linear')
# Alternative: assign the result back instead of using inplace=True on a
# column selection, which may operate on a temporary copy
df['col'] = df['col'].replace(to_replace=np.nan, value=col_mean)
# Replace/impute missing data with series
series_imp = df['col1'] * 5
df_imputed = df.fillna({'col2': series_imp})

df["col"].value_counts() # Look out for suspicious values

##### Strategic dropping example ########
# Drop rows with missing values in columns where <= 5% of the data are missing, otherwise impute values
threshold = len(df) * 0.05
cols_to_drop = df.columns[df.isna().sum() <= threshold]
df.dropna(subset=cols_to_drop, inplace=True)
# Columns that still contain missing values after the row drop
cols_with_missing_values = df.columns[df.isna().sum() > 0]
# The last of these columns is skipped here; presumably it is "num_col",
# which is imputed per group below - verify against the actual data
for col in cols_with_missing_values[:-1]:
    # fillna returns a new Series, so assign it back to keep the imputed values
    df[col] = df[col].fillna(df[col].mode()[0])
# Impute the numeric column with the median of its category
subgroup_dict = df.groupby("cat_col")["num_col"].median().to_dict()
df["num_col"] = df["num_col"].fillna(df["cat_col"].map(subgroup_dict))
```

# Chapter 3

### Autocorrelation and partial autocorrelation

- Autocorrelation:
- Let's consider the series 1, 2, 3, 4, 5, 6
- There is a correlation between every pair of consecutive elements:
    - second number = first number + 1 (correlation between first and second number)
    - third number = second number + 1 (correlation between second and third number)
- The same relationship holds between every n-th element and the element before it, so every consecutive pair shows the same correlation
- We can think of this as a kind of echo that is embedded in every element of the sequence
- We call this autocorrelation.
- So we say: autocorrelation is the correlation of a series with a lagged copy of itself
- We need it while working with time series.
- Negative autocorrelation : Mean Reversion
- Positive autocorrelation : Momentum, or Trend Following 

- Partial Autocorrelation:
- Shows the incremental benefit of adding the n-th lag when the first n-1 lags are already included
- Removes the effects of the previous lags (cumulative effect up to lag n minus cumulative effect up to lag n-1)
- e.g. a partial autocorrelation function of order 3 returns the correlation between our time series at t1 and its own value lagged by 3 time points, at t4, but only after removing all effects attributable to lags 1 and 2

# Chapter 4

### Heatmap

```
import numpy as np
import seaborn as sns

# Drop highly correlated features if you are confident that they may add bias into the model
corr_df = df.corr(method='pearson') # could also be 'spearman' or 'kendall'
# Mask the upper triangle and the diagonal: they only mirror the lower triangle
mask = np.triu(np.ones_like(corr_df, dtype=bool))
sns.heatmap(corr_df, mask=mask, center=0, linewidths=1, annot=True, fmt=".2f")
# Alternative approach
corr_df = df.corr().abs()
mask = np.triu(np.ones_like(corr_df, dtype=bool))
tri_df = corr_df.mask(mask) # Masked cells become NaN, leaving the lower triangle
# Columns whose absolute correlation with any other column exceeds 0.95
to_drop = [c for c in tri_df.columns if (tri_df[c] > 0.95).any()]
reduced_df = df.drop(to_drop, axis=1)
# Different concept : Hierarchical heatmap (similar columns are put closely together)
sns.clustermap(corr_df)
```

# Chapter 5

### Time series decomposition

```
# Time series decomposition
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Default figure size for the plots below (width, height in inches)
plt.rcParams['figure.figsize'] = 11, 9
decomposition = sm.tsa.seasonal_decompose(df['col'])
fig = decomposition.plot()
plt.show()
print(dir(decomposition)) # See what can be used
decomp_seasonal = decomposition.seasonal # Seasonal component
decomp_trend = decomposition.trend # Trend component
decomp_resid = decomposition.resid # Residual component
```

### Subplot python

```
# 2x2 grid of axes, unpacked row by row into named panels
fig, axs = plt.subplots(2, 2)
(main_ax, top_right_ax), (shared_ax, bottom_right_ax) = axs

main_ax.plot(x, y)
main_ax.set_title("main")

shared_ax.plot(x, y**2)
shared_ax.set_title("shares x with main")
shared_ax.sharex(main_ax)  # Tie this panel's x axis to the top-left panel

top_right_ax.plot(x + 1, y + 1)
top_right_ax.set_title("unrelated")

bottom_right_ax.plot(x + 2, y + 2)
bottom_right_ax.set_title("also unrelated")

fig.tight_layout()

# Alternative way
# Let pandas build the grid: separate subplots laid out in 2 rows and 2 columns,
# with independent x and y axes and no legends
facet_options = dict(
    subplots=True,
    layout=(2,2),
    sharex=False,
    sharey=False,
    linewidth=0.7,
    fontsize=3,
    legend=False,
)
ax = df.plot(**facet_options)

plt.show()
```