In [None]:
# Python tricks 

In [None]:
# Tags
#sketch, pd.cut()

# References
[Tips](https://medium.com/@Divithraju/six-amazing-unknown-python-libraries-c7bdad6b4472)
[Tips](https://pub.towardsai.net/six-amazing-python-libraries-that-im-using-now-cbcf4f4ddb79)
[Python for data engineers](https://towardsdatascience.com/python-for-data-engineers-f3d5db59b6dd)

In [None]:
# imports
import os
import time
import random

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from numpy.random import normal
from numpy import hstack
from statsmodels.distributions.empirical_distribution import ECDF
import sketch

# utils
from utils import tab_data

# settings
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# pandas display options used in the IPython shell:
#   max_rows / max_columns = None -> print all rows / all columns of a df
#   float_format                  -> render floats with five decimals
#   precision                     -> display precision of six
_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.float_format': lambda value: '%.5f' % value,
    'display.precision': 6,
}
for option_name, option_value in _display_options.items():
    pd.set_option(option_name, option_value)

## TOC:
* [sketch](#sketch)
* [List_comprehensions_for_json_data](#List_comprehensions_for_json_data)
* [Decorators](#Decorators)
* [Working_with_APIs](#Working_with_APIs)
* [maps](#maps)
* [filters](#filters)
* [Process_large_datasets_using_Pandas](#Process_large_datasets_using_Pandas)
* [binning](#binning)

## sketch <a class="anchor" id="sketch"></a>

In [None]:
# sketch - AI tool for eda

In [None]:
# Build the CSV location: <current working directory>/data/my_data.csv
my_file = 'my_data.csv'
path = os.path.abspath(os.getcwd())
input_path = os.path.join(path, 'data', my_file)

# Read the CSV into the DataFrame explored in the cells that follow
df = pd.read_csv(filepath_or_buffer=input_path)

In [None]:
# Show the column labels of the loaded DataFrame
df.columns

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame
df.shape

In [None]:
# Ask sketch, in plain English, which columns of df are categorical
df.sketch.ask("Which columns are categorical type?")

In [None]:
# Ask sketch for the shape of the DataFrame (compare with df.shape)
df.sketch.ask("What is the shape of the dataframe")

In [None]:
# Ask sketch for a code snippet that visualises the Actors column
df.sketch.howto("Visualize the Actors")

In [None]:
# Bar chart of how many rows each actor appears in.
# plt (matplotlib.pyplot) and sns (seaborn) come from the imports cell at the
# top of the notebook, so they are not re-imported here.

# All values of the Actors column as a plain Python list
actors = df['Actors'].tolist()

# Tally appearances in one pass over the list. The previous version called
# actors.count(actor) once per unique actor, which rescans the whole list
# every time (quadratic when most actors are distinct).
actor_count = {}
for actor in actors:
    actor_count[actor] = actor_count.get(actor, 0) + 1

# Unique actors in order of first appearance (dicts keep insertion order),
# so the bar order is the same on every run instead of following set order.
unique_actors = list(actor_count)

# Plot the counts on an explicit Axes so the labels and title belong to this figure
fig, ax = plt.subplots()
sns.barplot(x=unique_actors, y=list(actor_count.values()), ax=ax)
ax.set(xlabel='Actor', ylabel='Count', title='Number of rows per actor')
ax.tick_params(axis='x', labelrotation=90)  # actor names are long; rotate to avoid overlap
plt.show()

In [None]:
# Ask sketch for a code snippet that lists the unique values of the Actors column
df.sketch.howto("List the unique Actors")

In [None]:
# Pull out the Actors column, then reduce it to its distinct values
actor_column = df['Actors']
unique_actors = actor_column.unique()

# Show the distinct actors
print(unique_actors)

## List_comprehensions_for_json_data <a class="anchor" id="List_comprehensions_for_json_data"></a>

In [None]:
# List comprehensions for json data

In [None]:
import io
import json


def etl(item):
    """Serialise a single record to a JSON string."""
    serialised = json.dumps(item)
    return serialised


# Text file loaded as a blob
blob = """
        [
{"id":"1","first_name":"John"},
{"id":"2","first_name":"Mary"}
]
"""

# Parse the blob into Python objects, then emit one JSON document per line
json_data = json.loads(blob)
json_lines = [etl(record) for record in json_data]
data_str = "\n".join(json_lines)

print(data_str)

# Wrap the encoded text in an in-memory binary file object
data_file = io.BytesIO(data_str.encode())

# This data file is ready for BigQuery as Newline delimited JSON
print(data_file)

In [None]:
## Decorators <a class="anchor" id="Decorators"></a>

In [None]:
# Decorators

In [None]:
import functools


def etl_decorator(func):
    """Decorate an ETL step so its result is reported as being processed again.

    Args:
        func: Callable implementing the ETL step. It may take any positional
            or keyword arguments; they are forwarded unchanged.

    Returns:
        A wrapper that calls ``func`` and returns the string
        ``'Processing again <result>'``.
    """
    @functools.wraps(func)  # keep the wrapped step's __name__ and __doc__
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        return f'Processing again {result}'
    return wrapper


@etl_decorator
def unzip_data():
    """Stand-in ETL step that pretends to unzip data."""
    return "unzipped data"


@etl_decorator
def zip_data():
    """Stand-in ETL step that pretends to zip data."""
    return "zipped data"


print(unzip_data())  # Output: Processing again unzipped data
print(zip_data())  # Output: Processing again zipped data

## Working_with_APIs <a class="anchor" id="Working_with_APIs"></a>

In [None]:
# Working with APIs

In [None]:
import requests

# One Session so repeated calls can reuse the underlying connection
session = requests.Session()

url = "https://api.nasa.gov/neo/rest/v1/feed"

# Read the key from the NASA_API_KEY environment variable so a real key never
# has to be written into the notebook. If the variable is unset, the original
# placeholder is used and must be replaced by hand.
apiKey = os.environ.get("NASA_API_KEY", "your_api_key")

requestParams = {
    'api_key': apiKey,
    'start_date': '2023-04-20',
    'end_date': '2023-04-21'
}

# stream=True defers downloading the body. Only the status code is needed here,
# so the with-block closes the response and releases the connection.
# timeout stops the cell from hanging forever on an unresponsive server.
with session.get(url, params=requestParams, stream=True, timeout=10) as response:
    print(response.status_code)

## maps <a class="anchor" id="maps"></a>

In [None]:
# map

In [None]:
# map() transforms the data item by item: the function is applied to each
# element of the dataset, which is processed as an iterable

import math


def _factorial_of_sqrt_of_cube(value):
    """Return factorial(int(sqrt(value ** 3)))."""
    truncated_root = int(math.sqrt(value ** 3))
    return math.factorial(truncated_root)


numbers = [10, 20]
factorials = list(map(_factorial_of_sqrt_of_cube, numbers))
print(factorials)

## filters <a class="anchor" id="filters"></a>

In [None]:
# filter

In [None]:
# filter() keeps only the items for which the predicate returns True


def _is_even(value):
    """Return True when value is divisible by two."""
    return value % 2 == 0


numbers = [10, 21, 43, 88, 40]
even_numbers = list(filter(_is_even, numbers))
print(even_numbers)

## Process_large_datasets_using_Pandas <a class="anchor" id="Process_large_datasets_using_Pandas"></a>

In [None]:
# Process large datasets using Pandas

In [None]:
# Example not running code 1
# Illustrative pattern only: `filename` is not defined in the visible notebook,
# so this cell raises NameError if run as-is.
# NOTE(review): the `etl` defined in the JSON section calls json.dumps(item);
# presumably a batch-aware ETL function is meant here -- confirm before running.
batchsize = 10 ** 5  # number of rows read per chunk
# With chunksize set, read_csv yields the file chunk by chunk instead of one
# DataFrame; the with-block closes the reader once the loop finishes.
with pd.read_csv(filename, chunksize=batchsize) as reader:
    for batch in reader:
        etl(batch)

In [None]:
# Example not running code 2
# Illustrative pattern only: needs 'recommendation_data.csv' (semicolon-separated,
# no header row) next to the notebook.
# Read the file in chunks of 100000 rows; the columns are named explicitly,
# 'id' becomes the index and 'date' is parsed as a datetime.
batch_data=pd.read_table('recommendation_data.csv',chunksize=100000,sep=';',\
       names=['group','user','rating','date','id'],index_col='id',\
       header=None,parse_dates=['date'])

# Placeholder only: df is overwritten by the concat on the next line.
df=pd.DataFrame()
# For each chunk, sum 'rating' per (group, user, year of 'date'), then stack the
# per-chunk results; %time is an IPython magic that reports how long the statement takes.
# NOTE(review): sums are computed per chunk, so a (group, user, year) key that
# spans two chunks appears twice in df -- a final groupby(level=...).sum() would
# merge them; confirm whether that is intended.
%time df=pd.concat(batch.groupby(['group','user',batch['date'].map(lambda x: x.year)])['rating'].agg(['sum']) for batch in batch_data)

## binning <a class="anchor" id="binning"></a>

In [None]:
# binning using cut
# time, np and pd come from the imports cell at the top of the notebook.

N_EMPLOYEES = 1_000_000
rng = np.random.default_rng(42)  # fixed seed so every run produces the same scores

data = {
    'Employee_ID': range(1, N_EMPLOYEES + 1),
    # integers() excludes the upper bound, so 101 gives scores in 0..100 inclusive;
    # one vectorised draw replaces a million-element Python list comprehension
    'Performance_Score': rng.integers(0, 101, size=N_EMPLOYEES),
}
df = pd.DataFrame(data)

# perf_counter() is the monotonic clock meant for measuring short intervals
begin = time.perf_counter()

# Bin edges are right-inclusive: [0, 69], (69, 89], (89, 100]
bins = [0, 69, 89, 100]
labels = ['Needs Improvement', 'Good', 'Excellent']
# include_lowest=True closes the first interval on the left. Without it a score
# of exactly 0 falls outside (0, 69] and is silently categorised as NaN.
df['Performance_Category'] = pd.cut(
    df['Performance_Score'], bins=bins, labels=labels, include_lowest=True
)
end = time.perf_counter()
print(f"Total runtime of the program is {end - begin}")
# Runtime recorded by the original author for the pd.cut step: about 0.05 s