# Chapter 1

### Matplotlib Theme

```
# Switch the active Matplotlib style sheet to ggplot
plt.style.use("ggplot")
# Restore the built-in default style
plt.style.use("default")
# List every style name that plt.style.use accepts
available_styles = plt.style.available
print(available_styles)
```

### Time Series Graph

```
# Restrict the frame to 1960-1970 by slicing on the time index
decade = slice('1960', '1970')
sub_df = df[decade]

# Dashed blue line plot of the selected period
line_style = dict(color='blue', linewidth=3, linestyle='--')
ax = sub_df.plot(figsize=(12, 5), fontsize=12, **line_style)
ax.set(xlabel='X label', ylabel='Y label')

# Reference lines: the vertical one takes a time-index value, the horizontal one a y value
ax.axvline('1969-01-01', color='red', linestyle='--')
ax.axhline(4, color='green', linestyle='--')

# Shaded bands: the vertical one spans a time-index range, the horizontal one a y-value range
ax.axvspan('1964-01-01', '1968-01-01', color='red', alpha=0.3)
ax.axhspan(8, 6, color='green', alpha=0.3)

# Rolling mean over a 52-observation window
rolling_mean = df.rolling(window=52).mean()

# Other plot types: df.plot.kde(), df.plot.box() etc work the same way
df.plot.hist(bins=100)
```

# Chapter 2

### Missing values

```
# Check missing data: build the boolean mask once, then summarise it
missing_mask = df.isna()
missing_mask.any()  # for a DataFrame: per column, is any value missing?
missing_mask.sum()  # for a DataFrame: per column, how many values are missing?

# Plot the missing-data matrix with missingno, then display the figure
import missingno as msno
import matplotlib.pyplot as plt
msno.matrix(df)
plt.show()

# Drop missing data
# `subset` names labels on the *other* axis, so a column subset only works with
# the default axis=0: this drops the rows whose 'col' value is missing.
df_dropped = df.dropna(subset=['col'])
df.dropna(axis=0)  # Drop every row that contains a missing value (default)
df.dropna(axis=1)  # Drop every column that contains a missing value

# Replace/impute missing data with single value
import numpy as np
col_mean = df['col'].mean()
df_imputed = df.fillna({'col': col_mean})
df.bfill()  # Backward fill; replaces the deprecated fillna(method='bfill')
df.interpolate(method='linear')
# Alternative: assign the result back, because an in-place replace on the
# df['col'] selection is not guaranteed to update df itself
df['col'] = df['col'].replace(to_replace=np.nan, value=col_mean)
# Replace/impute missing data with series
series_imp = df['col1'] * 5
df_imputed = df.fillna({'col2': series_imp})

df["col"].value_counts()  # Look out for suspicious values

##### Strategic dropping example ########
# For columns where <= 5% of the values are missing, drop the affected rows;
# impute the remaining columns instead
threshold = len(df) * 0.05
cols_to_drop = df.columns[df.isna().sum() <= threshold]
df.dropna(subset=cols_to_drop, inplace=True)
# Columns that still contain missing values after the row drop
cols_with_missing_values = df.columns[df.isna().sum() > 0]
# [:-1] skips the last such column -- presumably "num_col", which is imputed
# per subgroup below (verify against the actual column order)
for col in cols_with_missing_values[:-1]:
    # fillna returns a new Series, so assign it back; fill with the most frequent value
    df[col] = df[col].fillna(df[col].mode()[0])
# Impute "num_col" with the median of the row's "cat_col" group
subgroup_dict = df.groupby("cat_col")["num_col"].median().to_dict()
df["num_col"] = df["num_col"].fillna(df["cat_col"].map(subgroup_dict))
```