https://towardsdatascience.com/pandas-and-python-tips-and-tricks-for-data-science-and-data-analysis-1b1e05b7d93a

## New Columns from Multiple Columns

In [4]:
import pandas as pd

# Create the dataframe
candidates= {
    'Name':["Aida","Mamadou","Ismael","Aicha","Fatou", "Khalil"],
    'Degree':['Master','Master','Bachelor', "PhD", "Master", "PhD"],
    'From':["Abidjan","Dakar","Bamako", "Abidjan","Konakry", "Lomé"],
    'Years_exp': [2, 3, 0, 5, 4, 3],
    'From_office(min)': [120, 95, 75, 80, 100, 34]
          }
candidates_df = pd.DataFrame(candidates)

"""
----------------My custom function-------------------
""" 
def candidate_info(row):

  # Select columns of interest 
  name = row.Name 
  is_from = row.From
  year_exp = row.Years_exp
  degree = row.Degree
  from_office = row["From_office(min)"]

  # Generate the description from previous variables
  info = f"""{name} from {is_from} holds a {degree} degree 
              with {year_exp} year(s) experience 
              and lives {from_office} from the office"""

  return info

"""
-------Application of the function to the data ------
"""
candidates_df["Description"] = candidates_df.apply(lambda row: candidate_info(row), axis=1)
display(candidates_df)

Unnamed: 0,Name,Degree,From,Years_exp,From_office(min),Description
0,Aida,Master,Abidjan,2,120,Aida from Abidjan holds a Master degree \n ...
1,Mamadou,Master,Dakar,3,95,Mamadou from Dakar holds a Master degree \n ...
2,Ismael,Bachelor,Bamako,0,75,Ismael from Bamako holds a Bachelor degree \n ...
3,Aicha,PhD,Abidjan,5,80,Aicha from Abidjan holds a PhD degree \n ...
4,Fatou,Master,Konakry,4,100,Fatou from Konakry holds a Master degree \n ...
5,Khalil,PhD,Lomé,3,34,Khalil from Lomé holds a PhD degree \n ...


## Convert categorical data into numerical ones

In [5]:
seniority = ['Entry level', 'Mid level', 'Senior level']
seniority_bins = [0, 1, 3, 5]
candidates_df['Seniority'] = pd.cut(candidates_df['Years_exp'],
                                    bins=seniority_bins, 
                                    labels=seniority, 
                                    include_lowest=True)

candidates_df

Unnamed: 0,Name,Degree,From,Years_exp,From_office(min),Description,Seniority
0,Aida,Master,Abidjan,2,120,Aida from Abidjan holds a Master degree \n ...,Mid level
1,Mamadou,Master,Dakar,3,95,Mamadou from Dakar holds a Master degree \n ...,Mid level
2,Ismael,Bachelor,Bamako,0,75,Ismael from Bamako holds a Bachelor degree \n ...,Entry level
3,Aicha,PhD,Abidjan,5,80,Aicha from Abidjan holds a PhD degree \n ...,Senior level
4,Fatou,Master,Konakry,4,100,Fatou from Konakry holds a Master degree \n ...,Senior level
5,Khalil,PhD,Lomé,3,34,Khalil from Lomé holds a PhD degree \n ...,Mid level


## Equal Size Bins

In [6]:
commute_time_labels = ["good", "acceptable", "too long"]
candidates_df["Commute_level"] = pd.qcut(
                                candidates_df["From_office(min)"],
                                q = 3, 
                                labels=commute_time_labels
                              )
candidates_df

Unnamed: 0,Name,Degree,From,Years_exp,From_office(min),Description,Seniority,Commute_level
0,Aida,Master,Abidjan,2,120,Aida from Abidjan holds a Master degree \n ...,Mid level,too long
1,Mamadou,Master,Dakar,3,95,Mamadou from Dakar holds a Master degree \n ...,Mid level,acceptable
2,Ismael,Bachelor,Bamako,0,75,Ismael from Bamako holds a Bachelor degree \n ...,Entry level,good
3,Aicha,PhD,Abidjan,5,80,Aicha from Abidjan holds a PhD degree \n ...,Senior level,acceptable
4,Fatou,Master,Konakry,4,100,Fatou from Konakry holds a Master degree \n ...,Senior level,too long
5,Khalil,PhD,Lomé,3,34,Khalil from Lomé holds a PhD degree \n ...,Mid level,good


## Select rows from a Pandas Dataframe based on column(s) values

In [9]:
# Get all the candidates with a Master degree
ms_candidates = candidates_df.query("Degree == 'Master'")
display(ms_candidates)

# Get non bachelor candidates
no_bs_candidates = candidates_df.query("Degree != 'Bachelor'")
display(no_bs_candidates)
# Get values from list
list_locations = ["Abidjan", "Dakar"]
candidates = candidates_df.query("From in @list_locations")
display(candidates)

Unnamed: 0,Name,Degree,From,Years_exp,From_office(min),Description,Seniority,Commute_level
0,Aida,Master,Abidjan,2,120,Aida from Abidjan holds a Master degree \n ...,Mid level,too long
1,Mamadou,Master,Dakar,3,95,Mamadou from Dakar holds a Master degree \n ...,Mid level,acceptable
4,Fatou,Master,Konakry,4,100,Fatou from Konakry holds a Master degree \n ...,Senior level,too long


Unnamed: 0,Name,Degree,From,Years_exp,From_office(min),Description,Seniority,Commute_level
0,Aida,Master,Abidjan,2,120,Aida from Abidjan holds a Master degree \n ...,Mid level,too long
1,Mamadou,Master,Dakar,3,95,Mamadou from Dakar holds a Master degree \n ...,Mid level,acceptable
3,Aicha,PhD,Abidjan,5,80,Aicha from Abidjan holds a PhD degree \n ...,Senior level,acceptable
4,Fatou,Master,Konakry,4,100,Fatou from Konakry holds a Master degree \n ...,Senior level,too long
5,Khalil,PhD,Lomé,3,34,Khalil from Lomé holds a PhD degree \n ...,Mid level,good


Unnamed: 0,Name,Degree,From,Years_exp,From_office(min),Description,Seniority,Commute_level
0,Aida,Master,Abidjan,2,120,Aida from Abidjan holds a Master degree \n ...,Mid level,too long
1,Mamadou,Master,Dakar,3,95,Mamadou from Dakar holds a Master degree \n ...,Mid level,acceptable
3,Aicha,PhD,Abidjan,5,80,Aicha from Abidjan holds a PhD degree \n ...,Senior level,acceptable


## Deal with zip files (no files)

In [10]:
import pandas as pd

"""
------------ READ ZIP FILES -----------
"""
# Case 1: read a single zip file 
candidate_df_unzip = pd.read_csv('candidates.csv.zip', compression='zip')

# Case 2: read a file from a folder
from zipfile import ZipFile

# Read the file from a zip folder
sales_df = pd.read_csv(ZipFile("data.zip").open('data/sales_df.csv'))


"""
------------ WRITE ZIP FILES -----------
"""
# Read data from internet
url = "https://raw.githubusercontent.com/keitazoumana/Fastapi-tutorial/master/data/spam.csv"
spam_data = pd.read_csv(url, encoding="ISO-8859-1")

# Save it as a zip file
spam_data.to_csv("spam.csv.zip", compression="zip")

# Check the files sizes
from os import path
path.getsize('spam.csv') / path.getsize('spam.csv.zip')

FileNotFoundError: [Errno 2] No such file or directory: 'candidates.csv.zip'

##  Select 𝗮 𝘀𝘂𝗯𝘀𝗲𝘁 𝗼𝗳 𝘆𝗼𝘂𝗿 𝗣𝗮𝗻𝗱𝗮𝘀 𝗱𝗮𝘁𝗮𝗳𝗿𝗮𝗺𝗲 𝘄𝗶𝘁𝗵 𝘀𝗽𝗲𝗰𝗶𝗳𝗶𝗰 𝗰𝗼𝗹𝘂𝗺𝗻 𝘁𝘆𝗽𝗲𝘀 (file missing)

In [11]:
# Import pandas library
import pandas as pd

# Read my dataset
candidates_df = pd.read_csv("./data/candidates_data.csv")

# Check the data columns' types
candidates_df.dtypes

# Only select columns of type "object" & "datetime"
candidates_df.select_dtypes(include = ["object", "datetime64"])

# Exclude columns of type "datetime" & "int"
candidates_df.select_dtypes(exclude = ["int64", "datetime64"])

FileNotFoundError: [Errno 2] No such file or directory: './data/candidates_data.csv'

## Remove comments from Pandas dataframe column

In [None]:
# Import pandas library
import pandas as pd

# Read my messy dataset
messy_df = pd.read_csv("./data/candidates_data.csv")

# FIRST SCENARIO -> REMOVE COMMENTS
clean_df = pd.read_csv("./data/candidates_data.csv", comment='#')

# SECOND SCENARIO -> CREATE NEW COLUMN FOR COMMENTS
messy_df[['application_date', 'comment']] = messy_df['application_date'].str.split('#', 1, expand=True)

## Print Pandas dataframe in Tabular format from consol

In [12]:
# Import pandas library
import pandas as pd

data_URL = "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/vgsales.csv" 

# Read your dataframe
video_game_data = pd.read_csv(data_URL)

"""
Printing without to_string() function
"""
print(video_game_data.head())

"""
Printing with to_string() function
"""
print(video_game_data.head().to_string())

   Rank                      Name Platform    Year         Genre Publisher  \
0     1                Wii Sports      Wii  2006.0        Sports  Nintendo   
1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo   
2     3            Mario Kart Wii      Wii  2008.0        Racing  Nintendo   
3     4         Wii Sports Resort      Wii  2009.0        Sports  Nintendo   
4     5  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  
0     41.49     29.02      3.77         8.46         82.74  
1     29.08      3.58      6.81         0.77         40.24  
2     15.85     12.88      3.79         3.31         35.82  
3     15.75     11.01      3.28         2.96         33.00  
4     11.27      8.89     10.22         1.00         31.37  
   Rank                      Name Platform    Year         Genre Publisher  NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales
0     1                Wii Sports

## Highlight data points in Pandas

In [13]:
import pandas as pd

my_info = {
    "Salary": [100000.2, 95000.9, 103000.2, 65984.1, 150987.08], 
    "Height": [6.5, 5.2, 5.59, 6.7, 6.92], 
    "weight": [185.23, 105.12, 110.3, 190.12, 200.59]      
}
my_data = pd.DataFrame(my_info)

"""
Function to highlight min and max
"""

def highlight_min_max(data_frame, min_color, max_color):

  # This first line create a styler object
  final_data = data_frame.style.highlight_max(color = max_color)

  # On this second line, no need to use ".style"
  final_data = final_data.highlight_min(color = min_color)

  return final_data
  
# Function to apply ORANGE to min and GREEN to max
highlight_min_max(my_data, min_color='orange', max_color='green')


"""
Custom function: apply RED or GREEN whether data is below or above the mean. 
"""
def highlight_values(data_row):
  low_value_color = "background-color:#C4606B  ; color: white;"
  high_value_color = "background-color: #C4DE6B; color: white;"   
  filter = data_row < data_row.mean()

  return [low_value_color if low_value else high_value_color for low_value in filter]
  
# Application of my custom function to only 'Height' & 'weight'
my_data.style.apply(highlight_values, subset=['Height', 'weight'])

Unnamed: 0,Salary,Height,weight
0,100000.2,6.5,185.23
1,95000.9,5.2,105.12
2,103000.2,5.59,110.3
3,65984.1,6.7,190.12
4,150987.08,6.92,200.59


## Reduce decimal points in your data

In [14]:
long_decimals_info = {
    "Salary": [100000.23400000, 95000.900300, 103000.2300535, 65984.14000450, 150987.080345], 
    "Height": [6.501050, 5.270000, 5.5900001050, 6.730001050, 6.92100050], 
    "weight": [185.23000059, 105.1200099, 110.350003, 190.12000000, 200.59000000]      
}

long_decimals_df = pd.DataFrame(long_decimals_info)

"""
Format the data with 2 decimal places
"""
fewer_decimals_df = long_decimals_df.round(decimals=2)
fewer_decimals_df

Unnamed: 0,Salary,Height,weight
0,100000.23,6.5,185.23
1,95000.9,5.27,105.12
2,103000.23,5.59,110.35
3,65984.14,6.73,190.12
4,150987.08,6.92,200.59


## Replace some values in your data frame

In [16]:
import pandas as pd
import numpy as np

candidates_info = {
    'Full_Name':["Aida Kone","Mamadou Diop","Ismael Camara","Aicha Konate",
                 "Fanta Koumare", "Khalil Cisse"],
    'degree':['Master','MS','Bachelor', "PhD", "Masters", np.nan],
    'From':[np.nan,"Dakar","Bamako", "Abidjan","Konakry", "Lomé"],
    'Age':[23,26,19, np.nan,25, np.nan],
          }

candidates_df = pd.DataFrame(candidates_info) 

"""
Replace Masters, Master by MS
"""
degrees_to_replace = ["Master", "Masters"]
candidates_df.replace(to_replace = degrees_to_replace, value = "MS", inplace=True)

"""
Replace all the NaN by "Missing"
"""
candidates_df.replace(to_replace=np.nan, value = "Missing", inplace=True)
candidates_df

Unnamed: 0,Full_Name,degree,From,Age
0,Aida Kone,MS,Missing,23
1,Mamadou Diop,MS,Dakar,26
2,Ismael Camara,Bachelor,Bamako,19
3,Aicha Konate,PhD,Abidjan,Missing
4,Fanta Koumare,MS,Konakry,25
5,Khalil Cisse,Missing,Lomé,Missing


## Compare two data frames and get their differences (no files)

In [17]:
import pandas as pd
from pandas.testing import assert_frame_equal

candidates_df = pd.read_csv("data/candidates.csv")

"""
Create a second dataframe by changing "Full_Name" & "Age" columns
"""
candidates_df_test = candidates_df.copy()
candidates_df_test.loc[0, 'Full_Name'] = 'Aida Traore'
candidates_df_test.loc[2, 'Age'] = 28

"""
Compare the two dataframes: candidates_df & candidates_df_test
"""
# 1. Comparison showing only unmatching values
candidates_df.compare(candidates_df_test)

# 2. Comparison including similar values
candidates_df.compare(candidates_df_test, keep_equal=True)

FileNotFoundError: [Errno 2] No such file or directory: 'data/candidates.csv'

## Get a subset of a very large dataset for quick analysis

In [None]:

# Pandas library
import pandas as pd 

# Load execution time
%load_ext autotime

# File to get sample from: Size: 261,6 MB
large_data = "diabetes_benchmark_data.csv"

# Sample size of interest
sample_size = 400

"""
Approach n°1: Read all the data in memory before getting the sample 
"""
read_whole_data = pd.read_csv(large_data)
sample_data = read_whole_data.head(sample_size)

"""
Approach n°2: Read the sample on the fly
"""
read_sample = pd.read_csv(large_data, nrows=sample_size)

## Transform your data frame from a wide to a long format

In [19]:
import pandas as pd

# My experimentation data
candidates= {
    'Name':["Aida","Mamadou","Ismael","Aicha"],
    'ID': [1, 2, 3, 4],
    '2017':[85, 87, 89, 91],
    '2018':[96, 98, 100, 102],
    '2019':[100, 102, 106, 106],
    '2020':[89, 95, 98, 100],
    '2021':[94, 96, 98, 100],
    '2022':[100, 104, 104, 107],
          }
"""
Data in wide format
"""
salary_data = pd.DataFrame(candidates)

"""
Transformation into the long format
"""
long_format_data = salary_data.melt(id_vars=['Name', 'ID'], 
                                    var_name='Year', value_name='Salary(k$)')
long_format_data                                   

Unnamed: 0,Name,ID,Year,Salary(k$)
0,Aida,1,2017,85
1,Mamadou,2,2017,87
2,Ismael,3,2017,89
3,Aicha,4,2017,91
4,Aida,1,2018,96
5,Mamadou,2,2018,98
6,Ismael,3,2018,100
7,Aicha,4,2018,102
8,Aida,1,2019,100
9,Mamadou,2,2019,102


## Reduce the size of your Pandas data frame by ignoring the index

In [20]:
import pandas as pd

# Read data from Github
URL = "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/diabetes.csv"
data = pd.read_csv(URL)

# Create large data by repeating each row 10000 times
large_data = data.loc[data.index.repeat(10000)]

"""
SAVE WITH INDEX
"""
large_data.to_csv("large_data_with_index.csv")

# Check the size of the file 
!ls -GFlash large_data_with_index.csv

"""
SAVE WITHOUT INDEX
"""
large_data.to_csv("large_data_without_index.csv", index = False)

# Check the size of the file 
!ls -GFlash large_data_without_index.csv   

KeyboardInterrupt: 

## Parquet instead of CSV

In [23]:
import pandas as pd

# Read data from Github
URL = "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/diabetes.csv"
data = pd.read_csv(URL)

# Create large data for experimentation by repeating each row 20.000 times
exp_data = data.loc[data.index.repeat(20000)]

"""
EXPERIMENT WITH .CSV FORMAT
"""
# Write Time
%%time 
exp_data.to_csv("exp_data.csv", index=False)

# Read Time
%%time
csv_data = pd.read_csv("exp_data.csv")

# File Size
!ls -GFlash exp_data.csv

"""
EXPERIMENT WITH .PARQUET FORMAT
"""
# Write Time
%%time 
exp_data.to_parquet('exp_data.parquet')

# Read Time
%%time 
parquet_data = pd.read_parquet('exp_data.parquet')

# File Size
!ls -GFlash exp_data.parquet   

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


UsageError: Line magic function `%%time` not found.


## Transform your data frame into a markdown

In [24]:
# ! pip install tabulate
data_URL="https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/vgsales.csv"
video_game_data=pd.read_csv(data_URL)
head_df=video_game_data.head()
print(head_df)
print(head_df.to_markdown(tablefmt="grid"))

   Rank                      Name Platform    Year         Genre Publisher  \
0     1                Wii Sports      Wii  2006.0        Sports  Nintendo   
1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo   
2     3            Mario Kart Wii      Wii  2008.0        Racing  Nintendo   
3     4         Wii Sports Resort      Wii  2009.0        Sports  Nintendo   
4     5  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  
0     41.49     29.02      3.77         8.46         82.74  
1     29.08      3.58      6.81         0.77         40.24  
2     15.85     12.88      3.79         3.31         35.82  
3     15.75     11.01      3.28         2.96         33.00  
4     11.27      8.89     10.22         1.00         31.37  
+----+--------+--------------------------+------------+--------+--------------+-------------+------------+------------+------------+---------------+----------------+

## Format Date Time column