## Combining datasets
Pandas provides various facilities for easily combining Series or DataFrames.
- **Concat:** combining DataFrames across rows or columns. 
- **Join:** combining data on a key column or an index.
- **Merge:** combining data on common columns or indexes. 

More info here: https://pandas.pydata.org/docs/user_guide/merging.html

In [1]:
from helpers import sample_df, hdisplay, nowrap_display
import pandas as pd


In [11]:
# Sample data: two frames with the same column labels (A-D) and index labels (0-3).
left = sample_df("A0", "D3", prefix="L_")
right = sample_df("A0", "D3", prefix="R_")

# display(left)                      # Does the same as ending a notebook cell with the expression, but can be called for several objects in one cell
# display(right)

# Show both frames under their titles (hdisplay is a project helper; presumably it lays them out horizontally).
hdisplay([left, right], ["Left", "Right"])



Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,A,B,C,D
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3


In [47]:
# axis="index" is the default and stacks the frames on top of each other,
# whereas axis="columns" places them side by side.
# NOTE(review): the saved output of this cell appears to come from a later run
# in which `right` spans C2-F5; re-run the notebook top to bottom to refresh it.
hdisplay(
    [
        pd.concat([left, right], axis="index"),
        pd.concat([left, right], axis="columns"),
    ],
    ["axis = 'index'", "axis= 'columns'"],
)

Unnamed: 0,A,B,C,D,E,F
0,L_A0,L_B0,L_C0,L_D0,,
1,L_A1,L_B1,L_C1,L_D1,,
2,L_A2,L_B2,L_C2,L_D2,,
3,L_A3,L_B3,L_C3,L_D3,,
2,,,R_C2,R_D2,R_E2,R_F2
3,,,R_C3,R_D3,R_E3,R_F3
4,,,R_C4,R_D4,R_E4,R_F4
5,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0,A,B,C,D,C.1,D.1,E,F
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5


In [17]:
# Concatenation keeps each frame's own index labels (0-3 appear twice in the result);
# usually we want unique index labels instead.
pd.concat([left,right])

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3


In [21]:
# Alternatives for rebuilding the index after a concat:
# pd.concat([left,right]).reset_index(drop=True)    # new index from 0 upward; drop=True discards the old index
# pd.concat([left, right]).set_index("C")           # use the values of a column as the index

hdisplay(
    [
        # Default: every frame keeps its own index labels.
        pd.concat([left, right], ignore_index=False),
        # Same result as the reset_index(drop=True) line above. Works on any
        # data, because the index is renumbered regardless of what is combined.
        pd.concat([left, right], ignore_index=True),
        # Along the columns it is the column labels that become numbers.
        pd.concat([left, right], axis="columns", ignore_index=True),
    ],
    [
        "ignore_index = False",
        "ignore_index = True",
        "axis = 'columns', ignore_index = True",
    ],
)


Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
4,R_A0,R_B0,R_C0,R_D0
5,R_A1,R_B1,R_C1,R_D1
6,R_A2,R_B2,R_C2,R_D2
7,R_A3,R_B3,R_C3,R_D3

Unnamed: 0,0,1,2,3,4,5,6,7
0,L_A0,L_B0,L_C0,L_D0,R_A0,R_B0,R_C0,R_D0
1,L_A1,L_B1,L_C1,L_D1,R_A1,R_B1,R_C1,R_D1
2,L_A2,L_B2,L_C2,L_D2,R_A2,R_B2,R_C2,R_D2
3,L_A3,L_B3,L_C3,L_D3,R_A3,R_B3,R_C3,R_D3


In [23]:
# New sample data: `right` now has columns C-F and index labels 2-5, so it only
# partly overlaps `left` (shared columns C and D, shared index labels 2 and 3).
left = sample_df("A0", "D3", prefix="L_")
right = sample_df("C2", "F5", prefix="R_")

hdisplay([left, right], ["Left", "Right"])

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,C,D,E,F
2,R_C2,R_D2,R_E2,R_F2
3,R_C3,R_D3,R_E3,R_F3
4,R_C4,R_D4,R_E4,R_F4
5,R_C5,R_D5,R_E5,R_F5


In [25]:
# Row-wise concat of partially overlapping frames: the result has the union of
# the columns (A-F), with missing values where a frame lacks a column.
pd.concat([left, right])

Unnamed: 0,A,B,C,D,E,F
0,L_A0,L_B0,L_C0,L_D0,,
1,L_A1,L_B1,L_C1,L_D1,,
2,L_A2,L_B2,L_C2,L_D2,,
3,L_A3,L_B3,L_C3,L_D3,,
2,,,R_C2,R_D2,R_E2,R_F2
3,,,R_C3,R_D3,R_E3,R_F3
4,,,R_C4,R_D4,R_E4,R_F4
5,,,R_C5,R_D5,R_E5,R_F5


In [33]:
# Compare the `join` option of pd.concat when concatenating along the columns.
# `join` controls how the *other* axis (here: the row index) is combined.
hdisplay(
    [
        pd.concat([left, right], axis="index"),
        # join="outer" is the default: keep the union of the index labels.
        pd.concat([left, right], axis="columns", join="outer"),
        # join="inner": keep only index labels present in both frames
        # (intersection); all columns of both frames are still included.
        pd.concat([left, right], axis="columns", join="inner"),
    ],
    # Each title names the option that differs, so the two column-wise results
    # can be told apart (they were previously both titled "axis = 'columns'").
    [
        "axis = 'index'",
        "axis = 'columns', join = 'outer'",
        "axis = 'columns', join = 'inner'",
    ],
    # Third positional argument of the hdisplay helper: per the original note,
    # the space between the tables in pixels — TODO confirm against helpers.hdisplay.
    20,
)


Unnamed: 0,A,B,C,D,E,F
0,L_A0,L_B0,L_C0,L_D0,,
1,L_A1,L_B1,L_C1,L_D1,,
2,L_A2,L_B2,L_C2,L_D2,,
3,L_A3,L_B3,L_C3,L_D3,,
2,,,R_C2,R_D2,R_E2,R_F2
3,,,R_C3,R_D3,R_E3,R_F3
4,,,R_C4,R_D4,R_E4,R_F4
5,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0,A,B,C,D,C.1,D.1,E,F
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0,A,B,C,D,C.1,D.1,E,F
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3


In [35]:
# keys adds an outer index level, giving a MultiIndex on the rows (two levels: key, original label).
pd.concat([left, right], keys = ['left', 'right'])
 

Unnamed: 0,Unnamed: 1,A,B,C,D,E,F
left,0,L_A0,L_B0,L_C0,L_D0,,
left,1,L_A1,L_B1,L_C1,L_D1,,
left,2,L_A2,L_B2,L_C2,L_D2,,
left,3,L_A3,L_B3,L_C3,L_D3,,
right,2,,,R_C2,R_D2,R_E2,R_F2
right,3,,,R_C3,R_D3,R_E3,R_F3
right,4,,,R_C4,R_D4,R_E4,R_F4
right,5,,,R_C5,R_D5,R_E5,R_F5


In [36]:

df = pd.concat([left, right], keys = ['left', 'right']) 
# Selecting on the outer index level returns the rows that came from `right`.
# keys is thus a way to concatenate two DataFrames while keeping track of which
# rows came from which frame, so the parts can be separated again later.
df.loc['right']


# df.loc[:, 'right']         # the corresponding selection if axis = 'columns' was used

Unnamed: 0,A,B,C,D,E,F
2,,,R_C2,R_D2,R_E2,R_F2
3,,,R_C3,R_D3,R_E3,R_F3
4,,,R_C4,R_D4,R_E4,R_F4
5,,,R_C5,R_D5,R_E5,R_F5


In [43]:
# Compare keys= along both axes: along the index the keys label the rows,
# along the columns they label the columns.
hdisplay([
    pd.concat([left, right], axis='index', keys=['left', 'right']),
    pd.concat([left, right], axis='columns', keys=['left', 'right'])],
    ["axis='index', keys=['left', 'right']", "axis='columns', keys=['left', 'right']"]
)
# With axis='columns' the keys form the outer level of the column MultiIndex,
# so selecting one source frame requires .loc[:, 'right'].



Unnamed: 0,Unnamed: 1,A,B,C,D,E,F
left,0,L_A0,L_B0,L_C0,L_D0,,
left,1,L_A1,L_B1,L_C1,L_D1,,
left,2,L_A2,L_B2,L_C2,L_D2,,
left,3,L_A3,L_B3,L_C3,L_D3,,
right,2,,,R_C2,R_D2,R_E2,R_F2
right,3,,,R_C3,R_D3,R_E3,R_F3
right,4,,,R_C4,R_D4,R_E4,R_F4
right,5,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0_level_0,left,left,left,left,right,right,right,right
Unnamed: 0_level_1,A,B,C,D,C,D,E,F
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5


### Concat

The pandas `concat()` function concatenates DataFrames row- or column-wise, <br>
with optional set logic (union or intersection) of the indexes on the other axis.

Concat simply stacks multiple dataframes together either vertically or horizontally after aligning on rows/columns. 

Concat aligns on column and row labels only; to align on the data itself instead, look at join and merge.


## Join
Join first aligns the indexes of two DataFrames and then picks up the remaining columns from the aligned rows of each DataFrame.

The **how** parameter can be any of the following:
- **left:** Get *all* rows from the left table, and join matching rows from the right table. **(default)**
- **right:** Get *all* rows from the right table, and join matching rows from the left table. 
- **outer:** Get *all* rows from both tables, join all matching rows on both sides. (union)
- **inner:** Get only rows that exist in both tables. (intersection)
- **cross:** Get every possible combination of rows from both tables (rarely used, but it is the basis for how the other joins work). The length of the new table is `len(left) * len(right)`, because every combination is produced.

In [48]:
# New sample data: `right` (columns A-F, rows 0-5) contains every column and
# index label of `left` (columns A-D, rows 0-3).
left = sample_df("A0", "D3", prefix="L")
right = sample_df("A0", "F5", prefix="R")

hdisplay([left, right], ["Left", "Right"])


Unnamed: 0,A,B,C,D
0,LA0,LB0,LC0,LD0
1,LA1,LB1,LC1,LD1
2,LA2,LB2,LC2,LD2
3,LA3,LB3,LC3,LD3

Unnamed: 0,A,B,C,D,E,F
0,RA0,RB0,RC0,RD0,RE0,RF0
1,RA1,RB1,RC1,RD1,RE1,RF1
2,RA2,RB2,RC2,RD2,RE2,RF2
3,RA3,RB3,RC3,RD3,RE3,RF3
4,RA4,RB4,RC4,RD4,RE4,RF4
5,RA5,RB5,RC5,RD5,RE5,RF5


In [53]:
# "left" and "right" are the standard terms for the two sides of a join.
# The suffixes are appended to column labels that exist in both frames (A-D
# here); without them join raises a ValueError for overlapping column labels.
# The same disambiguation can also be done in an earlier step, before joining.
left.join(right, lsuffix="_L", rsuffix="_Right")


Unnamed: 0,A_L,B_L,C_L,D_L,A_Right,B_Right,C_Right,D_Right,E,F
0,LA0,LB0,LC0,LD0,RA0,RB0,RC0,RD0,RE0,RF0
1,LA1,LB1,LC1,LD1,RA1,RB1,RC1,RD1,RE1,RF1
2,LA2,LB2,LC2,LD2,RA2,RB2,RC2,RD2,RE2,RF2
3,LA3,LB3,LC3,LD3,RA3,RB3,RC3,RD3,RE3,RF3


In [62]:
# New sample data.
# add_prefix renames the columns of both frames up front (LA..LD / RC..RF), so
# that no column labels collide when the frames are joined later.
left = sample_df("A0", "D3", prefix="L").add_prefix("L")
right = sample_df("C2", "F5", prefix="R").add_prefix("R")

hdisplay([left, right], ["Left", "Right"])


Unnamed: 0,LA,LB,LC,LD
0,LA0,LB0,LC0,LD0
1,LA1,LB1,LC1,LD1
2,LA2,LB2,LC2,LD2
3,LA3,LB3,LC3,LD3

Unnamed: 0,RC,RD,RE,RF
2,RC2,RD2,RE2,RF2
3,RC3,RD3,RE3,RF3
4,RC4,RD4,RE4,RF4
5,RC5,RD5,RE5,RF5


In [63]:
# `how` describes how the frames are joined and is the most important argument
# of join. "left" is the default.
hdisplay(
    [
        left.join(right, how="left"),
        left.join(right, how="right"),
    ],
    ["how = 'left'", "how = 'right'"],
)

# Why the results look like this: how="left" keeps the index labels of `left`
# (0-3) and how="right" keeps those of `right` (2-5). Columns coming from the
# other frame are missing wherever that frame has no row with the same label.

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
0,LA0,LB0,LC0,LD0,,,,
1,LA1,LB1,LC1,LD1,,,,
2,LA2,LB2,LC2,LD2,RC2,RD2,RE2,RF2
3,LA3,LB3,LC3,LD3,RC3,RD3,RE3,RF3

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
2,LA2,LB2,LC2,LD2,RC2,RD2,RE2,RF2
3,LA3,LB3,LC3,LD3,RC3,RD3,RE3,RF3
4,,,,,RC4,RD4,RE4,RF4
5,,,,,RC5,RD5,RE5,RF5


In [69]:
# NOTE(review): identical to the earlier prefixed sample-data cell; kept so that
# `left` and `right` are (re)defined at this point in the notebook.
# add_prefix renames the columns of both frames up front (LA..LD / RC..RF), so
# that no column labels collide when the frames are joined later.
left = sample_df("A0", "D3", prefix="L").add_prefix("L")
right = sample_df("C2", "F5", prefix="R").add_prefix("R")

hdisplay([left, right], ["Left", "Right"])

Unnamed: 0,LA,LB,LC,LD
0,LA0,LB0,LC0,LD0
1,LA1,LB1,LC1,LD1
2,LA2,LB2,LC2,LD2
3,LA3,LB3,LC3,LD3

Unnamed: 0,RC,RD,RE,RF
2,RC2,RD2,RE2,RF2
3,RC3,RD3,RE3,RF3
4,RC4,RD4,RE4,RF4
5,RC5,RD5,RE5,RF5


In [66]:
hdisplay(
    [
        left.join(right, how="inner"),  # intersection of the index labels
        left.join(right, how="outer"),  # union of the index labels
    ],
    ["how = 'inner'", "how = 'outer'"],
)


Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
2,LA2,LB2,LC2,LD2,RC2,RD2,RE2,RF2
3,LA3,LB3,LC3,LD3,RC3,RD3,RE3,RF3

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
0,LA0,LB0,LC0,LD0,,,,
1,LA1,LB1,LC1,LD1,,,,
2,LA2,LB2,LC2,LD2,RC2,RD2,RE2,RF2
3,LA3,LB3,LC3,LD3,RC3,RD3,RE3,RF3
4,,,,,RC4,RD4,RE4,RF4
5,,,,,RC5,RD5,RE5,RF5


In [67]:
# Cross join: every row of `left` is paired with every row of `right`; index
# labels are not matched and the result is numbered from 0.
left.join(right, how = "cross")

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
0,LA0,LB0,LC0,LD0,RC2,RD2,RE2,RF2
1,LA0,LB0,LC0,LD0,RC3,RD3,RE3,RF3
2,LA0,LB0,LC0,LD0,RC4,RD4,RE4,RF4
3,LA0,LB0,LC0,LD0,RC5,RD5,RE5,RF5
4,LA1,LB1,LC1,LD1,RC2,RD2,RE2,RF2
5,LA1,LB1,LC1,LD1,RC3,RD3,RE3,RF3
6,LA1,LB1,LC1,LD1,RC4,RD4,RE4,RF4
7,LA1,LB1,LC1,LD1,RC5,RD5,RE5,RF5
8,LA2,LB2,LC2,LD2,RC2,RD2,RE2,RF2
9,LA2,LB2,LC2,LD2,RC3,RD3,RE3,RF3


## Merge 

