In [1]:
import glob
import itertools
import json
import os
import pickle
import random
import re
import statistics
import string

import geopandas as gpd
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import scipy as sp
import seaborn as sns
from dotenv import load_dotenv
from pandas_profiling import ProfileReport

%matplotlib inline
import matplotlib as mpl
import matplotlib.font_manager
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from matplotlib.ticker import PercentFormatter

# secrets are read from the environment after loading the local .env file
load_dotenv()
some_apikey = os.environ.get("SOME_KEY")

# mapbox: map defaults, plus the access token handed to plotly express
TOKEN = os.environ.get("MAPBOX_TOKEN")
MAPBOX_STYLE, MAPBOX_HEIGHT = "dark", 800
px.set_mapbox_access_token(TOKEN)

# matplotlib configs
# NOTE(review): the list returned by findSystemFonts is discarded here; kept unchanged,
# but confirm whether this scan is actually needed.
matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext="ttf")
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so use whichever spelling this install provides.
# If neither is listed, fall back to the original name so the usual error surfaces.
_style_candidates = ("seaborn-v0_8-colorblind", "seaborn-colorblind")
plt.style.use(
    next((s for s in _style_candidates if s in plt.style.available), _style_candidates[-1])
)
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = "Open Sans"
rcParams["figure.figsize"] = 15, 6

# watermark
# Stamps the notebook with provenance for reproducibility: author, last-updated date,
# Python/IPython versions, versions of the packages listed after -p, machine details,
# hostname and the current git hash.
%reload_ext watermark
%watermark -a 'Ken Cavagnolo' -n -u -v -m -h -g -p jupyter,notebook,pandas,numpy,scipy

Author: Ken Cavagnolo

Last updated: Fri Aug 06 2021

Python implementation: CPython
Python version       : 3.8.0
IPython version      : 7.25.0

jupyter : 1.0.0
notebook: 6.4.0
pandas  : 1.3.0
numpy   : 1.21.0
scipy   : 1.7.0

Compiler    : GCC 10.3.0
OS          : Linux
Release     : 5.11.0-7620-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

Hostname: goldfinch

Git hash: 7886eb4c04fef3377c90fd2a167e9c0a6e0e89fb



In [None]:
# load data
# pd.read_csv requires a path or buffer as its first argument; the bare call raised
# TypeError before reading anything.
DATA_PATH = "data.csv"  # TODO: placeholder — point this at the real input file
df = pd.read_csv(DATA_PATH)

In [None]:
# inspect
# Build a ProfileReport over the whole frame. Leaving `profile` as the cell's last
# expression makes the notebook display it.
profile = ProfileReport(df)
profile

In [None]:
# clean column names: every run of non-alphanumeric characters becomes a single "_",
# then the whole name is lower-cased
expression = "[^A-Za-z0-9]+"
new_col_names = list(
    map(lambda name: re.sub(expression, "_", name).lower(), df.columns)
)
df.columns = new_col_names

In [None]:
# clean dates
# Convert the `date` column with pd.to_datetime (no explicit format is given) and
# overwrite the column with the result.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# drop complete duplicates (rows identical across every column)
# Rebind instead of mutating with inplace=True: the step stays chainable and never
# modifies a frame that another name may still reference.
df = df.drop_duplicates()

In [None]:
# replace dupes by some agg func
# Rows sharing the same `dupe_cols` key are collapsed into a single row.
dupe_cols = ['col_a', "col_b"]
replacements = {'col_c': "min", 'col_d': "mean"}
# Columns without an explicit rule keep their first non-null value per group. Passing
# only `replacements` to .agg() would return just the key and aggregated columns and
# silently drop every other column the later cells still read.
agg_funcs = {col: "first" for col in df.columns if col not in dupe_cols}
agg_funcs.update(replacements)
# dropna=False keeps rows whose key contains NaN; groupby discards them by default.
df = df.groupby(by=dupe_cols, dropna=False).agg(agg_funcs).reset_index()

In [None]:
# check for dupes: after aggregation no two rows may share the same `dupe_cols` key
duplicates = df.duplicated(subset=dupe_cols, keep=False)
assert not duplicates.any()

In [None]:
# check categories and membership: show the values that occur in only one of the
# two columns (symmetric difference)
set(df["col_e"]).symmetric_difference(df["col_f"])

In [None]:
# named categorical bins
bins = [0, 60, 180, np.inf]
labels = ['short', 'medium', 'long']
# include_lowest=True closes the first interval on the left, so a value of exactly 0
# is labelled 'short' instead of falling outside (0, 60] and becoming NaN.
df['col_g_binned'] = pd.cut(
    df["col_g"], bins=bins, labels=labels, include_lowest=True
)

# Day name -> day type lookup; any value not listed is left unchanged by replace()
mappings = {
    **dict.fromkeys(
        ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), "weekday"
    ),
    **dict.fromkeys(("Saturday", "Sunday"), "weekend"),
}
df["dow"] = df["day"].replace(mappings)

In [None]:
# check strings for consistency
check_cols = ["col_h", "col_j"]
# Regex character class matching any single ASCII punctuation character.
# str.contains needs a string/regex pattern (a set is rejected), and re.escape keeps
# characters such as "]", "^", "-" and "\" literal inside the class.
punctuation_pattern = "[" + re.escape(string.punctuation) + "]"
for col in check_cols:
    # every value must be at least 10 characters long
    sanity = df[col].str.len()
    assert sanity.min() >= 10
    # no value may contain punctuation; na=False counts missing values as "no match"
    assert not df[col].str.contains(punctuation_pattern, regex=True, na=False).any()