In [1]:
import pandas as pd

In [2]:
# City names; position i here lines up with position i in the other column lists.
cities = [
    "Philadelphia",
    "Boston",
    "Baltimore",
    "Orlando",
]

In [3]:
# Two-letter state codes, one per city.
states = [
    "PA",
    "MA",
    "MD",
    "FL",
]

In [4]:
# Population figure per city (units are not stated in this notebook).
population = [
    1.5,
    0.6,
    0.6,
    2.35,
]

In [5]:
# Visitor figure per city (units are not stated in this notebook).
visitors = [
    41,
    19,
    24,
    66,
]

In [6]:
# Column labels for the DataFrame built further down.
list_labels = [
    "City",
    "State",
    "Population",
    "Visitors",
]

In [8]:
# Column data, listed in the same order as the labels in list_labels.
list_cols = [
    cities,
    states,
    population,
    visitors,
]
list_cols

[['Philadelphia', 'Boston', 'Baltimore', 'Orlando'],
 ['PA', 'MA', 'MD', 'FL'],
 [1.5, 0.6, 0.6, 2.35],
 [41, 19, 24, 66]]

In [10]:
# Pair each label with its column values, then turn the pairs into a
# {label: values} dictionary that pandas can consume.
zipped = [(label, column) for label, column in zip(list_labels, list_cols)]
data = {label: column for label, column in zipped}
data

{'City': ['Philadelphia', 'Boston', 'Baltimore', 'Orlando'],
 'Population': [1.5, 0.6, 0.6, 2.35],
 'State': ['PA', 'MA', 'MD', 'FL'],
 'Visitors': [41, 19, 24, 66]}

In [11]:
# Build the frame from the dict: each key becomes a column, each list its values.
df = pd.DataFrame(data=data)
df

Unnamed: 0,City,Population,State,Visitors
0,Philadelphia,1.5,PA,41
1,Boston,0.6,MA,19
2,Baltimore,0.6,MD,24
3,Orlando,2.35,FL,66


In [12]:
# Use the state codes as row labels. The "State" column itself is kept, so the
# codes appear both as the index and as a regular column.
df.index = df["State"]
df

Unnamed: 0_level_0,City,Population,State,Visitors
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PA,Philadelphia,1.5,PA,41
MA,Boston,0.6,MA,19
MD,Baltimore,0.6,MD,24
FL,Orlando,2.35,FL,66


In [13]:
# for loop and iterrows()
# iterrows() walks the frame one row at a time, yielding (index label, row Series)
# pairs; the label is the state code because the index was set from "State".
for st, row in df.iterrows():
    print("st {}".format(st))
    print("row {}".format(row))

st PA
row City          Philadelphia
Population             1.5
State                   PA
Visitors                41
Name: PA, dtype: object
st MA
row City          Boston
Population       0.6
State             MA
Visitors          19
Name: MA, dtype: object
st MD
row City          Baltimore
Population          0.6
State                MD
Visitors             24
Name: MD, dtype: object
st FL
row City          Orlando
Population       2.35
State              FL
Visitors           66
Name: FL, dtype: object


In [14]:
# Now let's pull out just the data we need: the index label and the City value.
for st, row in df.iterrows():
    print(": ".join([st, row["City"]]))

PA: Philadelphia
MA: Boston
MD: Baltimore
FL: Orlando


In [16]:
# Now let's do some math: count the characters in each City name and save the
# count in a new "city_length" column, one row at a time.
# NOTE: the resulting column is float64 (see the 12.0 values below), presumably
# because the column is created with missing values on the first write and is
# then filled in row by row.
for st, row in df.iterrows():
    # df.loc[row label, column label] = value writes a single cell.
    df.loc[st, "city_length"] = len(row["City"])
df

Unnamed: 0_level_0,City,Population,State,Visitors,city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PA,Philadelphia,1.5,PA,41,12.0
MA,Boston,0.6,MA,19,6.0
MD,Baltimore,0.6,MD,24,9.0
FL,Orlando,2.35,FL,66,7.0


In [17]:
# However, the row-by-row loop above is not the most efficient way to do it,
# especially for a big data set. Compute every length in one column-wise call:
# the .str accessor handles the whole column and yields NaN for a missing value,
# where apply(len) would raise a TypeError.
df["new_city_length"] = df["City"].str.len()
df

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PA,Philadelphia,1.5,PA,41,12.0,12
MA,Boston,0.6,MA,19,6.0,6
MD,Baltimore,0.6,MD,24,9.0,9
FL,Orlando,2.35,FL,66,7.0,7


In [19]:
# Summarise the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, PA to FL
Data columns (total 6 columns):
City               4 non-null object
Population         4 non-null float64
State              4 non-null object
Visitors           4 non-null int64
city_length        4 non-null float64
new_city_length    4 non-null int64
dtypes: float64(2), int64(2), object(2)
memory usage: 224.0+ bytes


In [20]:
# Filtering a DataFrame starts with a boolean mask: one True/False per row.
df["new_city_length"] > 7

State
PA     True
MA    False
MD     True
FL    False
Name: new_city_length, dtype: bool

In [21]:
# Custom filtering: keep the mask in a variable, then select the rows where it is True.
greater_than = df["new_city_length"] > 7
df.loc[greater_than]

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PA,Philadelphia,1.5,PA,41,12.0,12
MD,Baltimore,0.6,MD,24,9.0,9


In [22]:
# Or we could build the mask inline.
df.loc[df["new_city_length"] > 7]

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PA,Philadelphia,1.5,PA,41,12.0,12
MD,Baltimore,0.6,MD,24,9.0,9


In [23]:
# Filters can be combined with & (element-wise AND); each comparison needs its
# own parentheses.
df.loc[
    (df["new_city_length"] > 7)
    & (df["new_city_length"] < 10)
]

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MD,Baltimore,0.6,MD,24,9.0,9


In [24]:
# Filters can be combined with | (element-wise OR); each comparison needs its
# own parentheses.
df.loc[
    (df["new_city_length"] == 9)
    | (df["new_city_length"] >= 10)
]

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PA,Philadelphia,1.5,PA,41,12.0,12
MD,Baltimore,0.6,MD,24,9.0,9


In [25]:
# Filtering one column based on another: rows are chosen by the mask on
# new_city_length, and only the City column is returned.
df.loc[df["new_city_length"] == 7, "City"]

State
FL    Orlando
Name: City, dtype: object

In [27]:
# NOTE(review): this cell holds only a commented-out import and does nothing when run.
# from ipykernel import kernelapp as app

In [28]:
# Modifying one column based on another: double Visitors for the rows whose
# city name has 6 characters.
# A single .loc[row_mask, column] assignment writes into df itself. The chained
# form df.Visitors[mask] *= 2 assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and is not guaranteed to update df.
# Note: this changes df in place, so every re-run of this cell doubles the
# selected values again.
df.loc[df["new_city_length"] == 6, "Visitors"] *= 2
df

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PA,Philadelphia,1.5,PA,41,12.0,12
MA,Boston,0.6,MA,76,6.0,6
MD,Baltimore,0.6,MD,24,9.0,9
FL,Orlando,2.35,FL,66,7.0,7


In [29]:
# Defining a function to apply to the values of a DataFrame column
def hundreds(n):
    """Return ``n`` divided by 100 (true division, so integers give floats)."""
    scaled = n / 100
    return scaled

# Series.apply calls hundreds on each value and returns a new Series; the
# Visitors column in df is left as it was.
df["Visitors"].apply(hundreds)

State
PA    0.41
MA    0.76
MD    0.24
FL    0.66
Name: Visitors, dtype: float64

In [30]:
# Same result with an anonymous (lambda) function instead of a named one.
df["Visitors"].apply(lambda value: value / 100)

State
PA    0.41
MA    0.76
MD    0.24
FL    0.66
Name: Visitors, dtype: float64

In [31]:
# Now we can store this data as a new column. Dividing the column directly is
# the vectorized equivalent of apply(lambda n: n/100): it gives the same values
# without calling a Python function once per row.
df["Visitors/hundreds"] = df["Visitors"] / 100
df

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length,Visitors/hundreds
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA,Philadelphia,1.5,PA,41,12.0,12,0.41
MA,Boston,0.6,MA,76,6.0,6,0.76
MD,Baltimore,0.6,MD,24,9.0,9,0.24
FL,Orlando,2.35,FL,66,7.0,7,0.66


In [32]:
# Manipulating strings: the .str accessor applies a string method to every
# value in the column; here the City names are replaced by their upper-case form.
df["City"] = df["City"].str.upper()
df

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length,Visitors/hundreds
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PA,PHILADELPHIA,1.5,PA,41,12.0,12,0.41
MA,BOSTON,0.6,MA,76,6.0,6,0.76
MD,BALTIMORE,0.6,MD,24,9.0,9,0.24
FL,ORLANDO,2.35,FL,66,7.0,7,0.66


In [33]:
# Column calculations: adding two columns adds their values row by row.
# NOTE(review): the label "Total_poulation" looks like a typo for
# "Total_population"; it is left unchanged because other cells may refer to it.
df["Total_poulation"] = df["Population"] + df["Visitors"]
df

Unnamed: 0_level_0,City,Population,State,Visitors,city_length,new_city_length,Visitors/hundreds,Total_poulation
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PA,PHILADELPHIA,1.5,PA,41,12.0,12,0.41,42.5
MA,BOSTON,0.6,MA,76,6.0,6,0.76,76.6
MD,BALTIMORE,0.6,MD,24,9.0,9,0.24,24.6
FL,ORLANDO,2.35,FL,66,7.0,7,0.66,68.35
