### pandas-modules, basic examples, DataFrames

In [278]:
# typical imports 
# pip install numpy
# pip install pandas
import numpy as np
import pandas as pd

In [279]:
# seed NumPy's global random generator so that every run
# produces the same "random" numbers
from numpy.random import randint
np.random.seed(321)

# row labels (index): one company per row
companies = [
    "Jolly Good Toys Ltd",
    "Pristine Machines Inc.",
    "Piggy Bank Corporation",
    "Cars & Gears Foundation",
]

# column labels: one sales area per column
areas = [
    "Europe",
    "North_America",
    "South_America",
    "Asia",
    "Africa",
    "Other",
]

# random sales figures from NumPy, one value per (company, area) pair;
# randint's upper bound is exclusive, so values fall between 0 and 99998
sales = randint(99999, size=(len(companies), len(areas)))

# assemble the DataFrame: values, row labels, column labels
df = pd.DataFrame(data=sales, index=companies, columns=areas)

In [280]:
# df is already a DataFrame, so it renders as a table directly;
# there is no need to call .to_frame() first
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335


**Filtering data within a DataFrame**

In [281]:
# to get the data of ONE particular column only, select it by its name
# (all rows, the 'Europe' column):
col_df = df.loc[:, 'Europe']

# notice: selecting a SINGLE column
# gives back a pandas SERIES, not a DataFrame
col_df

Jolly Good Toys Ltd        97268
Pristine Machines Inc.     24744
Piggy Bank Corporation     28243
Cars & Gears Foundation    57643
Name: Europe, dtype: int32

In [282]:
# more often we select several columns at once, because we usually
# pick only the most useful variables (columns)
# for training a machine learning model
columns = ['North_America', 'South_America']
cols_df = df.loc[:, columns]

# the shorter bracket form does the same thing in one line:
# cols_df = df[['North_America', 'South_America']]

cols_df

Unnamed: 0,North_America,South_America
Jolly Good Toys Ltd,6682,4220
Pristine Machines Inc.,69018,99288
Piggy Bank Corporation,44245,21633
Cars & Gears Foundation,46518,71742


In [283]:
# NOTE! column names must match the original data EXACTLY
# (including lowercase/uppercase letters)
# a handy way to quickly check the column names:
df.columns

Index(['Europe', 'North_America', 'South_America', 'Asia', 'Africa', 'Other'], dtype='object')

**Creating new columns and fixing existing ones**

In [284]:
# add the six area columns together, row by row:
# Europe, North_America, South_America, Asia, Africa, Other
df['Total_Sales'] = (
    df['Europe']
    .add(df['North_America'])
    .add(df['South_America'])
    .add(df['Asia'])
    .add(df['Africa'])
    .add(df['Other'])
)

In [285]:
# display the DataFrame; it now includes the Total_Sales column
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other,Total_Sales
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537,249443
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536,392123
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389,255723
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335,421573


In [286]:
# THIS IS ONLY A SIMULATED DEMONSTRATION OF WHY REAL DATA OFTEN NEEDS CLEANING
# the values are turned into text and prefixed with "$ "
# (we'd never do this on purpose with real data)
df['Total_Sales'] = df['Total_Sales'].astype(str).radd("$ ")

In [287]:
# notice how Total_Sales is now text (strings) because of the "$ " prefix
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other,Total_Sales
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537,$ 249443
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536,$ 392123
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389,$ 255723
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335,$ 421573


In [288]:
# uncomment the line below to confirm that Total_Sales is currently
# of dtype object (text data), which can't be used as-is
# for data analytics or ML applications
# df.dtypes

In [289]:
# if your data has extra characters in otherwise numeric columns,
# we can clean it easily, e.g. by stripping the "$ " prefix.
# regex=False makes "$ " literal text: in pandas versions before 2.0
# str.replace defaulted to regex=True, where "$" is an end-of-string anchor,
# so nothing would be removed and astype(int) would fail.
# the leading astype(str) keeps this cell safe to re-run
# after the column has already been converted to numbers.
df['Total_Sales'] = (
    df['Total_Sales']
    .astype(str)
    .str.replace("$ ", "", regex=False)
    .astype(int)
)

In [290]:
# display the DataFrame; Total_Sales has been converted back to integers
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other,Total_Sales
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537,249443
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536,392123
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389,255723
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335,421573


In [291]:
# to double-check that every column/variable really is numeric,
# inspect the DataFrame's dtypes attribute
df.dtypes

Europe           int32
North_America    int32
South_America    int32
Asia             int32
Africa           int32
Other            int32
Total_Sales      int64
dtype: object

In [292]:
# derive another column from an existing one:
# Total_Sales divided by 12, rounded to two decimals
df['Monthly_Sales'] = df['Total_Sales'].div(12).round(2)

In [293]:
# display the DataFrame; it now includes the Monthly_Sales column
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other,Total_Sales,Monthly_Sales
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537,249443,20786.92
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536,392123,32676.92
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389,255723,21310.25
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335,421573,35131.08


**Deleting (dropping) unneeded columns**

In [294]:
# a typical pandas notebook starts by dropping (deleting)
# the columns that are not needed

# columns= tells drop() that the label is a COLUMN name; without it
# (or axis=1) pandas would look for a ROW with that label,
# and we don't have a row called Monthly_Sales
df = df.drop(columns="Monthly_Sales")

# dropping rows is fairly rare (but possible)
# df = df.drop("Pristine Machines Inc.")

In [295]:
# OPTION 2:
# the inplace parameter modifies the DataFrame directly,
# so the result does not have to be reassigned back to df
# e.g. df.drop('Monthly_Sales', axis=1, inplace=True)

In [296]:
# Monthly_Sales no longer appears among the columns:
# the drop above removed it
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other,Total_Sales
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537,249443
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536,392123
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389,255723
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335,421573


In [297]:
# loc looks up a row by its LABEL (here: the company name);
# handy for inspecting problematic or extreme rows (outliers)
row = df.loc["Piggy Bank Corporation", :]
row.to_frame()

Unnamed: 0,Piggy Bank Corporation
Europe,28243
North_America,44245
South_America,21633
Asia,78619
Africa,45594
Other,37389
Total_Sales,255723


In [298]:
# display the full DataFrame again; the loc lookup only read from it
df

Unnamed: 0,Europe,North_America,South_America,Asia,Africa,Other,Total_Sales
Jolly Good Toys Ltd,97268,6682,4220,43807,5929,91537,249443
Pristine Machines Inc.,24744,69018,99288,14068,90469,94536,392123
Piggy Bank Corporation,28243,44245,21633,78619,45594,37389,255723
Cars & Gears Foundation,57643,46518,71742,99722,48613,97335,421573


In [None]:
# iloc looks up a row by its numeric POSITION instead of its label
# (notice the i in iloc); position 1 is the second row
row = df.iloc[1, :]
row.to_frame()

# df.iloc is often used to take a smaller slice out of a huge dataset,
# for example only the first 1000 rows:
# df_smaller = df.iloc[0:1000]

Unnamed: 0,Pristine Machines Inc.
Europe,24744
North_America,69018
South_America,99288
Asia,14068
Africa,90469
Other,94536
Total_Sales,392123


**Conditional filtering in pandas**