# Data Wrangling: Join, Combine, and Reshape
<ol>
    <li><b>Hierarchical Indexing (Series and DataFrames)</b>
    <ul><li>Creating an index</li>
        <li>Accessing (selecting) by index</li>
        <li>Sorting by index</li>
        <li>Summary statistics by level</li>
        <li>Indexing with a DataFrame's columns</li>
    </ul>
    </li>
    <li><b>Combining and Merging DataFrames</b></li>
</ol>

In [52]:
import pandas as pd
import numpy as np

In [183]:
# 1. Hierarchical indexing: Series
# Build a Series whose index has two levels. The index is given as two
# parallel lists: the outer labels first, then the inner labels.
multi_index = [
    ['animal', 'animal', 'fruit', 'fruit'],
    ['good_counts', 'bad_counts', 'good_counts', 'bad_counts'],
]

# Four random integers in 5..9 (the upper bound 10 is exclusive).
random_counts = np.random.randint(5, 10, size=4, dtype=int)
s = pd.Series(random_counts, index=multi_index)

s

animal  good_counts    5
        bad_counts     7
fruit   good_counts    6
        bad_counts     9
dtype: int64

In [184]:
# Inspect the index of the Series s; the recorded output shows it is a
# MultiIndex holding one (outer, inner) tuple per row.
s.index

MultiIndex([('animal', 'good_counts'),
            ('animal',  'bad_counts'),
            ( 'fruit', 'good_counts'),
            ( 'fruit',  'bad_counts')],
           )

In [188]:
#1 Hierarchical Indexing: Series
#---accessing elements: one element / more than one element, slicing
#---note: (1) slicing only works if the index is in increasing (sorted) order.
#---      This means that in the above example, if you put 'fruit' and then
#---      'animal' in the index, the slice s['fruit':'animal'] will not work
#---      (2) here, think of it as label-based indexing
#---      (3) only the last expression in the cell is displayed; run the
#---      lines one at a time to see each result

#---first level:

# attribute-style access to the outer label 'animal'
s.animal
# the same selection with bracket syntax
s['animal']
# a list of outer labels selects several groups at once
s[['animal', 'fruit']]
# label slice over the outer level
s['animal':'fruit']

# the same selections written with .loc:
#s.loc['animal']
#s.loc[['animal', 'fruit']]
#s.loc['animal':'fruit']

#your turn
#---second level: the point here is to think of the outcome of
#---              the first-level access

# s.animal is itself a Series indexed by the inner labels, so a second
# attribute access picks a single value
s.animal.good_counts
# the same value selected in one step with an (outer, inner) tuple
s[('animal', 'good_counts')]

5

In [89]:
# 1. Hierarchical indexing: DataFrames
# A DataFrame whose rows carry a two-level index; the columns stay flat.

row_labels = [
    ["animal", "animal", "fruit", "fruit"],
    ['good_counts', 'bad_counts', 'good_counts', 'bad_counts'],
]

# 4 x 3 grid of random integers in 1..9 (upper bound 10 is exclusive).
df = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 3)),
    index=row_labels,
    columns=['IA', 'MA', 'NY'],
)

df


Unnamed: 0,Unnamed: 1,IA,MA,NY
animal,good_counts,6,1,3
animal,bad_counts,3,8,4
fruit,good_counts,8,6,6
fruit,bad_counts,2,4,1


In [189]:
# 1. Hierarchical indexing: DataFrames
# A DataFrame whose columns carry a two-level index (state, region);
# the rows keep the default integer index.

column_labels = [
    ['IA', 'IA', 'MA', 'MA'],
    ['North', 'South', 'North', 'South'],
]

# 4 x 4 grid of random integers in 1..9 (upper bound 10 is exclusive).
df = pd.DataFrame(np.random.randint(1, 10, size=(4, 4)), columns=column_labels)
df

Unnamed: 0_level_0,IA,IA,MA,MA
Unnamed: 0_level_1,North,South,North,South
0,4,7,2,6
1,7,8,3,9
2,8,2,5,8
3,2,4,6,1


In [190]:
# 1. Hierarchical indexing: DataFrames
# A DataFrame with a two-level index on BOTH axes:
# rows are (category, count type), columns are (state, region).

row_labels = [
    ["animal", "animal", "fruit", "fruit"],
    ['good_counts', 'bad_counts', 'good_counts', 'bad_counts'],
]
column_labels = [
    ['IA', 'IA', 'MA', 'MA'],
    ['North', 'South', 'North', 'South'],
]

# 4 x 4 grid of random integers in 1..9 (upper bound 10 is exclusive).
df = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 4)),
    index=row_labels,
    columns=column_labels,
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,IA,IA,MA,MA
Unnamed: 0_level_1,Unnamed: 1_level_1,North,South,North,South
animal,good_counts,3,4,1,1
animal,bad_counts,5,6,6,9
fruit,good_counts,4,1,2,3
fruit,bad_counts,2,4,5,4


In [199]:
#1 Hierarchical Indexing: DataFrames
#--- accessing (selecting) data: column(s), row(s)
#--- note: only the last expression in the cell is displayed; run the
#---       lines one at a time to see each result

#--- columns -----------------------------------------------------------
# every column under the outer label 'IA'
df.IA
# several outer labels at once
df[['IA', 'MA']]

# a single column: give the full (outer, inner) tuple in one lookup
# instead of chaining two lookups
df[('IA', 'North')]
#your turn: take all North columns
df[[('IA', 'North'), ('MA', 'North')]]

#--- rows by position --------------------------------------------------
# .iloc makes it explicit that these slices count rows, not labels
df.iloc[0:1]
df.iloc[0:2]

#--- rows by label -----------------------------------------------------
df.loc['animal']
df.loc['animal':'fruit']

# one row, selected with the full (outer, inner) tuple
df.loc[('animal', 'good_counts')]

#your turn:
#--- take all good_counts rows of IA
df.IA.loc[[('animal', 'good_counts'), ('fruit', 'good_counts')]]

# same result in one .loc call: the row selector is a tuple with one entry
# per index level (a list of outer labels, then one inner label); the column
# selector is the outer label 'IA'
df.loc[(['animal', 'fruit'], 'good_counts'), 'IA']



Unnamed: 0,Unnamed: 1,North,South
animal,good_counts,3,4
fruit,good_counts,4,1


In [124]:
# 1. Hierarchical indexing: sort_index
# Series case: the outer labels are deliberately created out of order
# ('fruit' before 'animal') so that sorting has a visible effect.

multi_index = [
    ['fruit', 'fruit', 'animal', 'animal'],
    ['good_counts', 'bad_counts', 'good_counts', 'bad_counts'],
]

# Four random integers in 5..9 (the upper bound 10 is exclusive).
s = pd.Series(np.random.randint(5, 10, size=4, dtype=int), index=multi_index)

# Sort ascending starting from the outer level; sort_index returns a new
# Series and leaves s untouched.
s1 = s.sort_index(level=0)
s1

animal  bad_counts     8
        good_counts    7
fruit   bad_counts     9
        good_counts    7
dtype: int64

In [126]:
# 1. Hierarchical indexing: sort_index
# DataFrame case: sort the COLUMNS, starting from their inner level.

row_labels = [
    ["animal", "animal", "fruit", "fruit"],
    ['good_counts', 'bad_counts', 'good_counts', 'bad_counts'],
]
column_labels = [
    ['IA', 'IA', 'MA', 'MA'],
    ['North', 'South', 'North', 'South'],
]

# 4 x 4 grid of random integers in 1..9 (upper bound 10 is exclusive).
df = pd.DataFrame(
    np.random.randint(1, 10, size=(4, 4)),
    index=row_labels,
    columns=column_labels,
)

# axis=1 targets the columns; level=1 is the inner (North/South) level;
# descending order. A new frame is returned, df itself is not modified.
df1 = df.sort_index(axis=1, level=1, ascending=False)
df1


Unnamed: 0_level_0,Unnamed: 1_level_0,MA,IA,MA,IA
Unnamed: 0_level_1,Unnamed: 1_level_1,South,South,North,North
animal,good_counts,4,2,3,5
animal,bad_counts,1,9,3,1
fruit,good_counts,4,3,7,2
fruit,bad_counts,7,9,6,8


In [202]:
# 1. Hierarchical indexing: summary statistics by level
# Series case.

multi_index = [
    ['animal', 'animal', 'fruit', 'fruit'],
    ['good_counts', 'bad_counts', 'good_counts', 'bad_counts'],
]

# Four random integers in 5..9 (the upper bound 10 is exclusive).
s = pd.Series(np.random.randint(5, 10, size=4, dtype=int), index=multi_index)
s

# Total per outer label (animal / fruit).
per_category = s.groupby(level=0)
s1 = per_category.sum()
s1

# your turn: how many good_counts, how many bad_counts?
# Total per inner label; this overwrites s1 and is what the cell displays.
per_count_type = s.groupby(level=1)
s1 = per_count_type.sum()
s1


bad_counts     15
good_counts    13
dtype: int64

In [207]:
#1 Hierarchical indexing: Summary statistics by level
#---for df

# 4 x 4 grid of random integers in 1..9 with a two-level index on both axes:
# rows are (category, count type), columns are (state, region).
df = pd.DataFrame(np.random.randint(low = 1, high = 10, size = 16).reshape((4, 4)),
                  index=[["animal", "animal", "fruit", "fruit"],
                         ['good_counts', 'bad_counts',
                          'good_counts', 'bad_counts']],
                  columns=[['IA', 'IA', 'MA', 'MA'],
                           ['North', 'South', 'North', 'South']])
df

# Sum the rows that share an outer row label (animal / fruit).
# groupby works on rows by default, so no axis argument is needed.
df1 = df.groupby(level = 0).sum()
df1

# Sum the columns that share an outer column label (IA / MA).
# The `axis` argument of DataFrame.groupby is deprecated (pandas 2.1+), so
# group column-wise by transposing, grouping the rows, and transposing back.
df2 = df.T.groupby(level = 0).sum().T
df2

#your turn: how many good_counts, bad_counts for each state
# df2 still has the two-level row index; level 1 is the count type.
df3 = df2.groupby(level = 1).sum()
df3


Unnamed: 0,IA,MA
bad_counts,14,19
good_counts,28,24


In [212]:
# 1. Hierarchical indexing: indexing with a DataFrame's columns
# set_index(): promote existing columns to (hierarchical) index levels.

df = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": [0, 1, 2, 0, 1, 2, 3],
    }
)
df

# Columns 'c' and 'd' become the two index levels and are removed from the
# data columns; a new frame is returned and df is left unchanged.
df1 = df.set_index(['c', 'd'])
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [213]:
#1 Hierarchical indexing: indexing with dataframes' columns
#---reset_index() moves the index levels ('c' and 'd' here) back into
#---ordinary columns and gives the frame the default integer index 0, 1, 2, ...
df2 = df1.reset_index()
df2


Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [214]:
# 2. Merging DataFrames

# Left table: keys a-d with values 1..4.
df1 = pd.DataFrame({"key1": list("abcd"), "data1": np.arange(1, 5)})

# Right table: keys c-f with values 11..14.
df2 = pd.DataFrame({"key2": list("cdef"), "data2": np.arange(11, 15)})

# The key columns have different names, so name each side explicitly.
# An inner join keeps only the keys that appear in both tables.
df = df1.merge(df2, how="inner", left_on="key1", right_on="key2")

# your turn: use the Venn diagram I drew on the whiteboard to understand the
# merge types selected by "how":
#   how = "left" | "right" | "inner" | "outer"
# Make sure you understand how = "cross" as well.
# Sometimes you can join on the index instead of left_on / right_on,
# but I prefer left_on and right_on because the code is more readable.

Unnamed: 0,key1,data1,key2,data2
0,a,1.0,,
1,b,2.0,,
2,c,3.0,c,11.0
3,d,4.0,d,12.0
4,,,e,13.0
5,,,f,14.0


In [232]:
# 2. Merging: a larger example with three related tables.

# Students: one row per student (id, full name).
student_rows = [
    ["11", "aa"],
    ["22", "bb"],
    ["33", "cc"],
    ["44", "dd"],
    ["55", "ee"],
]
students = pd.DataFrame(student_rows, columns=["sID", "full_name"])

# Courses: one row per course (id, title).
course_rows = [
    ["CS 150", "Intro to CS"],
    ["CS 252", "OPP with Java"],
    ["DS 120", "Intro to DS"],
    ["DS 320", "Data Vis"],
    ["DS 420", "ML"],
]
courses = pd.DataFrame(course_rows, columns=["cID", "name"])

# Enrollments: which student took which course, with grade and term.
enrollment_rows = [
    ["11", "CS 150", "A", "Fall", "2022"],
    ["22", "CS 150", "A", "SP", "2023"],
    ["33", "CS 150", "B", "Fall", "2022"],
    ["44", "DS 120", "A", "SP", "2023"],
    ["55", "DS 120", "B", "Fall", "2022"],
    ["11", "CS 252", "A", "SP", "2023"],
    ["22", "CS 252", "A", "SP", "2023"],
    ["33", "DS 320", "A", "SP", "2023"],
    ["44", "DS 320", "B", "SP", "2023"],
    ["55", "DS 420", "A", "Fall", "2022"],
]
sc = pd.DataFrame(
    enrollment_rows,
    columns=["sID", "cID", "Grades", "Semester", "Year"],
)

sc

Unnamed: 0,sID,cID,Grades,Semester,Year
0,11,CS 150,A,Fall,2022
1,22,CS 150,A,SP,2023
2,33,CS 150,B,Fall,2022
3,44,DS 120,A,SP,2023
4,55,DS 120,B,Fall,2022
5,11,CS 252,A,SP,2023
6,22,CS 252,A,SP,2023
7,33,DS 320,A,SP,2023
8,44,DS 320,B,SP,2023
9,55,DS 420,A,Fall,2022


In [234]:
# 1. List every student together with the courses they took and the grade.
# Both tables call the key column 'sID', so a single `on` names it for both
# sides. The suffixes would only be applied to non-key column names that
# appear in both tables.
enrolled = students.merge(
    sc,
    on="sID",
    how="inner",
    suffixes=('_student', '_SC'),
)

# Keep only the columns of interest, in display order.
df = enrolled[['sID', 'full_name', 'cID', 'Grades']]

df
 

Unnamed: 0,sID,full_name,cID,Grades
0,11,aa,CS 150,A
1,11,aa,CS 252,A
2,22,bb,CS 150,A
3,22,bb,CS 252,A
4,33,cc,CS 150,B
5,33,cc,DS 320,A
6,44,dd,DS 120,A
7,44,dd,DS 320,B
8,55,ee,DS 120,B
9,55,ee,DS 420,A


In [221]:
# 2. pd.concat()
# Recall that pd.concat() lines its inputs up by index when concatenating
# Series or DataFrames:
#   - concatenating two or more Series with axis='rows' returns a Series
#   - with axis='columns' it returns a DataFrame

# Four rows: keys a-d with values 1..4.
df1 = pd.DataFrame({"key1": list("abcd"), "data1": np.arange(1, 5)})

# Five rows: keys c-g with values 11..15.
df2 = pd.DataFrame({"key2": list("cdefg"), "data2": np.arange(11, 16)})

# your turn
#
# 1. concatenate the two key columns of df1 and df2 into a single Series
# 2. concatenate df1 and df2 by rows
# 3. concatenate df1 and df2 by columns
# 4. what are the difference(s) between pd.concat() and pd.merge()?


Unnamed: 0,key1,key2
0,a,c
1,b,d
2,c,e
3,d,f


In [None]:
# Reading: combine_first(), reshaping (stack() / unstack()), and pivoting

