In [1]:
import pandas as pd

In [2]:
# movies dataset
movies = pd.read_pickle('./dataset/movies/movies.p')
print(movies.shape)
movies.head()

(4803, 4)


Unnamed: 0,id,title,popularity,release_date
0,257,Oliver Twist,20.415572,2005-09-23
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17


In [3]:
#taglines dataset
taglines = pd.read_pickle('./dataset/movies/taglines.p')
print(taglines.shape)
taglines.head()

(3955, 2)


Unnamed: 0,id,tagline
0,19995,Enter the World of Pandora.
1,285,"At the end of the world, the adventure begins."
2,206647,A Plan No One Escapes
3,49026,The Legend Ends
4,49529,"Lost in our world, found in another."


## Filter joins
-  semi join
-  anti join

Mutation join vs filter join
- mutation is commbining data from two tables based on matching obsevation in both tables
- filtering observation from table is based on weather or not they match an observation in another table

### 1. semi joins
- return the intersection, similar to an inner join
- return only column from left table and **not** the rigth
- No duplicated

<img src='./media/semi_join.png' width=700 height=800>

- step 1 --> simple inner join for semi join
- step 2 --> making a filter of semi join
- step 3 --> filtering data 

In [4]:
#step1 -->simple inner join for semi join
movies_tag = movies.merge(taglines, on='id')
movies_tag.head()

Unnamed: 0,id,title,popularity,release_date,tagline
0,14290,Better Luck Tomorrow,3.877036,2002-01-12,Never underestimate an overachiever.
1,38365,Grown Ups,38.864027,2010-06-24,Boys will be boys. . . some longer than others.
2,9672,Infamous,3.680896,2006-11-16,There's more to the story than you know
3,12819,Alpha and Omega,12.300789,2010-09-17,A Pawsome 3D Adventure
4,49529,John Carter,43.926995,2012-03-07,"Lost in our world, found in another."


In [5]:
#step 2 --> making a filter of semi join
movies['id'].isin(movies_tag['id'])

0       False
1        True
2        True
3        True
4        True
        ...  
4798     True
4799     True
4800    False
4801     True
4802     True
Name: id, Length: 4803, dtype: bool

In [6]:
# step 3 --> filtering data 
tagged_movies = movies[movies['id'].isin(movies_tag['id'])]
tagged_movies.head()

Unnamed: 0,id,title,popularity,release_date
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17
5,49529,John Carter,43.926995,2012-03-07


In [7]:
#semi join in one
movies_tag = movies.merge(taglines, on='id')
tagged_movies = movies[movies['id'].isin(movies_tag['id'])]
tagged_movies.head()

Unnamed: 0,id,title,popularity,release_date
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17
5,49529,John Carter,43.926995,2012-03-07


### 2. anti join
- opposite to semi join
- return the left table, **excluding the intersaction**
- return only column from the left **not** from the right

<img src='./media/anti join.png' width= 700 height=800>

- step 1 --> simple left join for anti join
- step 2 --> making a filter of anti join

In [8]:
# step 1 --> simple left join for anti join
movies_tag = movies.merge(taglines, on='id', how='left', indicator=True)
print(movies_tag.shape)
movies_tag.head()

(4803, 6)


Unnamed: 0,id,title,popularity,release_date,tagline,_merge
0,257,Oliver Twist,20.415572,2005-09-23,,left_only
1,14290,Better Luck Tomorrow,3.877036,2002-01-12,Never underestimate an overachiever.,both
2,38365,Grown Ups,38.864027,2010-06-24,Boys will be boys. . . some longer than others.,both
3,9672,Infamous,3.680896,2006-11-16,There's more to the story than you know,both
4,12819,Alpha and Omega,12.300789,2010-09-17,A Pawsome 3D Adventure,both


In [9]:
# step 2 --> making a filter for anti join
id_list = movies_tag.loc[movies_tag['_merge']=='left_only', 'id']
pd.DataFrame(id_list).head()

Unnamed: 0,id
0,257
8,20024
16,2610
17,13074
18,26672


In [10]:
# step 3 --> applying filter
movies_tag = movies.merge(taglines, on='id', how='left', indicator=True)
id_list = movies_tag.loc[movies_tag['_merge']=='left_only', 'id']
non_tagged_movies = movies_tag[movies_tag['id'].isin(id_list)]
non_tagged_movies.head()

Unnamed: 0,id,title,popularity,release_date,tagline,_merge
0,257,Oliver Twist,20.415572,2005-09-23,,left_only
8,20024,The End of the Affair,6.921263,1999-12-03,,left_only
16,2610,Shopgirl,4.820166,2005-10-21,,left_only
17,13074,Resurrecting the Champ,4.898437,2007-06-14,,left_only
18,26672,The Thief and the Cobbler,2.439184,1993-09-23,,left_only


## Concatenate DataFrames together vertically
- pandas **.concat()** can concatenate both vertically and horizentally
- **axis=0** for vertical

<img src='./media/verticaal_concatenation.png' width= 400 height= 500>

In [11]:
jan_movies = movies.iloc[1:5]
jan_movies

Unnamed: 0,id,title,popularity,release_date
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17


In [12]:
feb_movies = movies.iloc[11:15]
feb_movies

Unnamed: 0,id,title,popularity,release_date
11,158752,Escape from Tomorrow,1.352222,2013-10-11
12,10956,Joe Dirt,15.976335,2001-04-10
13,2757,Adaptation.,26.441669,2002-12-06
14,308531,Teenage Mutant Ninja Turtles: Out of the Shadows,39.873791,2016-06-01


In [13]:
march_movies = movies.iloc[21:25]
march_movies

Unnamed: 0,id,title,popularity,release_date
21,59,A History of Violence,34.628738,2005-09-23
22,10586,The Ghost and the Darkness,12.46525,1996-10-11
23,55180,The Last Big Thing,0.678475,1998-09-23
24,157058,30 Nights of Paranormal Activity With the Devi...,1.737032,2013-01-14


In [14]:
#basic concatenation
pd.concat([jan_movies,feb_movies,march_movies])

Unnamed: 0,id,title,popularity,release_date
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17
11,158752,Escape from Tomorrow,1.352222,2013-10-11
12,10956,Joe Dirt,15.976335,2001-04-10
13,2757,Adaptation.,26.441669,2002-12-06
14,308531,Teenage Mutant Ninja Turtles: Out of the Shadows,39.873791,2016-06-01
21,59,A History of Violence,34.628738,2005-09-23
22,10586,The Ghost and the Darkness,12.46525,1996-10-11


In [15]:
# Ignoring the index
pd.concat([jan_movies,feb_movies,march_movies], ignore_index=True)

Unnamed: 0,id,title,popularity,release_date
0,14290,Better Luck Tomorrow,3.877036,2002-01-12
1,38365,Grown Ups,38.864027,2010-06-24
2,9672,Infamous,3.680896,2006-11-16
3,12819,Alpha and Omega,12.300789,2010-09-17
4,158752,Escape from Tomorrow,1.352222,2013-10-11
5,10956,Joe Dirt,15.976335,2001-04-10
6,2757,Adaptation.,26.441669,2002-12-06
7,308531,Teenage Mutant Ninja Turtles: Out of the Shadows,39.873791,2016-06-01
8,59,A History of Violence,34.628738,2005-09-23
9,10586,The Ghost and the Darkness,12.46525,1996-10-11


In [16]:
# Setting labels to original tables
pd.concat([jan_movies,feb_movies,march_movies], ignore_index=False, keys=['jan', 'feb', 'mar'])

Unnamed: 0,Unnamed: 1,id,title,popularity,release_date
jan,1,14290,Better Luck Tomorrow,3.877036,2002-01-12
jan,2,38365,Grown Ups,38.864027,2010-06-24
jan,3,9672,Infamous,3.680896,2006-11-16
jan,4,12819,Alpha and Omega,12.300789,2010-09-17
feb,11,158752,Escape from Tomorrow,1.352222,2013-10-11
feb,12,10956,Joe Dirt,15.976335,2001-04-10
feb,13,2757,Adaptation.,26.441669,2002-12-06
feb,14,308531,Teenage Mutant Ninja Turtles: Out of the Shadows,39.873791,2016-06-01
mar,21,59,A History of Violence,34.628738,2005-09-23
mar,22,10586,The Ghost and the Darkness,12.46525,1996-10-11


In [17]:
jan_tags = taglines.iloc[1:5]
jan_tags

Unnamed: 0,id,tagline
1,285,"At the end of the world, the adventure begins."
2,206647,A Plan No One Escapes
3,49026,The Legend Ends
4,49529,"Lost in our world, found in another."


In [18]:
# Concatenate tables with different column names
pd.concat([jan_movies,jan_tags], sort=True) #<-- sorting column name

Unnamed: 0,id,popularity,release_date,tagline,title
1,14290,3.877036,2002-01-12,,Better Luck Tomorrow
2,38365,38.864027,2010-06-24,,Grown Ups
3,9672,3.680896,2006-11-16,,Infamous
4,12819,12.300789,2010-09-17,,Alpha and Omega
1,285,,,"At the end of the world, the adventure begins.",
2,206647,,,A Plan No One Escapes,
3,49026,,,The Legend Ends,
4,49529,,,"Lost in our world, found in another.",


In [19]:
pd.concat([jan_movies,jan_tags], sort=False) #<-- without sorting column names bydefault False

Unnamed: 0,id,title,popularity,release_date,tagline
1,14290,Better Luck Tomorrow,3.877036,2002-01-12,
2,38365,Grown Ups,38.864027,2010-06-24,
3,9672,Infamous,3.680896,2006-11-16,
4,12819,Alpha and Omega,12.300789,2010-09-17,
1,285,,,,"At the end of the world, the adventure begins."
2,206647,,,,A Plan No One Escapes
3,49026,,,,The Legend Ends
4,49529,,,,"Lost in our world, found in another."


In [20]:
# Concatenate tables with different column names
pd.concat([jan_movies, jan_tags],join='inner')#<-- applying inner join on columns by default outer

Unnamed: 0,id
1,14290
2,38365
3,9672
4,12819
1,285
2,206647
3,49026
4,49529


### Using append method
**.append()**
- Simplified version of **.concat()**
- suppor : **sort_index** and **sort**
- Not support : **keys** and **join** i:e. always **join == outer**
    

In [21]:
jan_movies.append([feb_movies,march_movies], ignore_index=True, sort=True)

Unnamed: 0,id,popularity,release_date,title
0,14290,3.877036,2002-01-12,Better Luck Tomorrow
1,38365,38.864027,2010-06-24,Grown Ups
2,9672,3.680896,2006-11-16,Infamous
3,12819,12.300789,2010-09-17,Alpha and Omega
4,158752,1.352222,2013-10-11,Escape from Tomorrow
5,10956,15.976335,2001-04-10,Joe Dirt
6,2757,26.441669,2002-12-06,Adaptation.
7,308531,39.873791,2016-06-01,Teenage Mutant Ninja Turtles: Out of the Shadows
8,59,34.628738,2005-09-23,A History of Violence
9,10586,12.46525,1996-10-11,The Ghost and the Darkness


## Verifying integrity

<img src= './media/verfying_integrity.png'>

## Validating merges
**.merge(validate=None)**
- check if merge is not specified type
    - 'one to one'
    - 'one to many'
    - 'many to one'
    - 'many to many'

In [22]:
# lets check it on movies and taglines
print(movies.merge(taglines , on='id', validate='one_to_one').shape)
movies.merge(taglines , on='id', validate='one_to_one').head()

(3955, 5)


Unnamed: 0,id,title,popularity,release_date,tagline
0,14290,Better Luck Tomorrow,3.877036,2002-01-12,Never underestimate an overachiever.
1,38365,Grown Ups,38.864027,2010-06-24,Boys will be boys. . . some longer than others.
2,9672,Infamous,3.680896,2006-11-16,There's more to the story than you know
3,12819,Alpha and Omega,12.300789,2010-09-17,A Pawsome 3D Adventure
4,49529,John Carter,43.926995,2012-03-07,"Lost in our world, found in another."


if one possible we'll get below error

**Traceback (most recent call last):<br>
MergeError: Merge keys are not unique in right dataset; not a one-to-one merge**

## Verifying concatenations
**.concat(verify_integrity=False)** :
- Check whether the new concatenated index contains duplicates
- Default value is **False**

In [23]:
pd.concat([jan_movies,feb_movies], verify_integrity=False)

Unnamed: 0,id,title,popularity,release_date
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17
11,158752,Escape from Tomorrow,1.352222,2013-10-11
12,10956,Joe Dirt,15.976335,2001-04-10
13,2757,Adaptation.,26.441669,2002-12-06
14,308531,Teenage Mutant Ninja Turtles: Out of the Shadows,39.873791,2016-06-01


In [24]:
duplicate_jan_movies = movies.iloc[1:5]
duplicate_feb_movies = movies.iloc[4:5]

In [25]:
pd.concat([duplicate_jan_movies,duplicate_feb_movies], verify_integrity=False)

Unnamed: 0,id,title,popularity,release_date
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17
4,12819,Alpha and Omega,12.300789,2010-09-17


In [None]:
#<-- Give Error because integrity is true to chk duplicated
pd.concat([duplicate_jan_movies,duplicate_feb_movies], verify_integrity=True) 

# Practice

### Task1

#### Required datasets

In [27]:
employees = pd.read_csv('./employees.csv')
employees.head()

Unnamed: 0,srid,lname,fname,title,hire_date,email
0,1,Adams,Andrew,General Manager,2002-08-14,andrew@chinookcorp.com
1,2,Edwards,Nancy,Sales Manager,2002-05-01,nancy@chinookcorp.com
2,3,Peacock,Jane,Sales Support Agent,2002-04-01,jane@chinookcorp.com
3,4,Park,Margaret,Sales Support Agent,2003-05-03,argaret@chinookcorp.com
4,5,Johnson,Steve,Sales Support Agent,2003-10-17,steve@chinookcorp.com


In [28]:
top_cust = pd.read_csv('./top_cust.csv')
top_cust.head()

Unnamed: 0,cid,srid,fname,lname,phone,fax,email
0,1,3,Luís,Gonçalves,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br
1,2,5,Leonie,Köhler,+49 0711 2842222,,leonekohler@surfeu.de
2,3,3,François,Tremblay,+1 (514) 721-4711,,ftremblay@gmail.com
3,4,4,Bjørn,Hansen,+47 22 44 22 22,,bjorn.hansen@yahoo.no
4,5,4,František,Wichterlová,+420 2 4172 5555,+420 2 4172 5555,frantisekw@jetbrains.com


####  requirements
- Merge employees and top_cust with a left join, setting indicator argument to True. Save the result to empl_cust.
- Select the srid column of empl_cust and the rows where _merge is 'left_only'. Save the result to srid_list.
- Subset the employees table and select those rows where the srid is in the variable srid_list and print the results.

In [29]:
# Merge employees and top_cust
empl_cust = employees.merge(top_cust, on='srid', 
                                 how='left', indicator=True)

# Select the srid column where _merge is left_only
srid_list = empl_cust.loc[empl_cust['_merge'] == 'left_only', 'srid']

# Get employees not working with top customers
employees[employees['srid'].isin(srid_list)]

Unnamed: 0,srid,lname,fname,title,hire_date,email
0,1,Adams,Andrew,General Manager,2002-08-14,andrew@chinookcorp.com
1,2,Edwards,Nancy,Sales Manager,2002-05-01,nancy@chinookcorp.com
5,6,Mitchell,Michael,IT Manager,2003-10-17,michael@chinookcorp.com
6,7,King,Robert,IT Staff,2004-01-02,robert@chinookcorp.com
7,8,Callahan,Laura,IT Staff,2004-03-04,laura@chinookcorp.com


### Task2

#### The required datasets

In [30]:
non_mus_tcks = pd.read_csv('./non_musk_tcks.csv')
non_mus_tcks.head()

Unnamed: 0,tid,name,aid,mtid,gid,u_price
0,2820,Occupation / Precipice,227,3,19.0,1.99
1,2821,Exodus Pt.1,227,3,19.0,1.99
2,2822,Exodus Pt.2,227,3,19.0,1.99
3,2823,Collaborators,227,3,19.0,1.99
4,2824,Torn,227,3,19.0,1.99


In [31]:
top_invoices = pd.read_csv('./top_invoices.csv')
top_invoices.head()

Unnamed: 0,ilid,iid,tid,uprice,quantity
0,470,88,2832,1.99,1
1,473,88,2850,1.99,1
2,476,88,2868,1.99,1
3,527,96,3214,1.99,1
4,528,96,3223,1.99,1


In [32]:
genres = pd.read_csv('./genres.csv')
genres.head()

Unnamed: 0,gid,name
0,1,Rock
1,2,Jazz
2,3,Metal
3,4,Alternative & Punk
4,5,Rock And Roll


####  The required tasks
- Merge non_mus_tcks and top_invoices on tid using an inner join. Save the result as tracks_invoices.
- Use .isin() to subset the rows of non_mus_tck where tid is in the tid column of tracks_invoices. Save the result as top_tracks.
- Group top_tracks by gid and count the tid rows. Save the result to cnt_by_gid.
- Merge cnt_by_gid with the genres table on gid and print the result.

In [35]:
non_mus_tcks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   tid      60 non-null     object 
 1   name     60 non-null     object 
 2   aid      60 non-null     int64  
 3   mtid     60 non-null     int64  
 4   gid      60 non-null     float64
 5   u_price  59 non-null     float64
dtypes: float64(2), int64(2), object(2)
memory usage: 2.9+ KB


In [36]:
top_invoices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ilid      16 non-null     int64  
 1   iid       16 non-null     int64  
 2   tid       16 non-null     int64  
 3   uprice    16 non-null     float64
 4   quantity  16 non-null     int64  
dtypes: float64(1), int64(4)
memory usage: 768.0 bytes


In [66]:
def numbers(x):
    try:
        x = str(x)
        return "".join([i for i in x if str.isnumeric(i)])
    except:
        return 0
non_mus_tcks.tid.apply(numbers).head()

0    2820
1    2821
2    2822
3    2823
4    2824
Name: tid, dtype: object

In [46]:
import numpy as np
non_mus_tcks['tid'] = non_mus_tcks['tid'].apply(numbers)
non_mus_tcks['tid'] = non_mus_tcks['tid'].apply(np.int64)

In [48]:
# Merge the non_mus_tck and top_invoices tables on tid
tracks_invoices = non_mus_tcks.merge(top_invoices, on='tid')

# Use .isin() to subset non_mus_tcsk to rows with tid in tracks_invoices
top_tracks = non_mus_tcks[non_mus_tcks['tid'].isin(tracks_invoices['tid'])]

# Group the top_tracks by gid and count the tid rows
cnt_by_gid = top_tracks.groupby(['gid'], as_index=False).agg({'tid':'count'})

# Merge the genres table to cnt_by_gid on gid and print
cnt_by_gid.merge(genres, on='gid')



Unnamed: 0,gid,tid,name
0,19.0,1,TV Shows


### Task3

#### required datasets

In [49]:
tracks_master = pd.read_csv('./tracks_master.csv')
tracks_master.head()

Unnamed: 0,tid,name,aid,mtid,gid,composer,u_price
0,1853,Battery,152,1,3,J.Hetfield/L.Ulrich,0.99
1,1854,Master Of Puppets,152,1,3,K.Hammett,0.99
2,1857,Disposable Heroes,152,1,3,J.Hetfield/L.Ulrich,0.99


In [50]:
tracks_ride = pd.read_csv('./tracks_ride.csv')
tracks_ride.head()

Unnamed: 0,tid,name,aid,mtid,gid,u_price
0,1874,Fight Fire With Fire,154,1,3,0.99
1,1875,Ride The Lightning,154,1,3,0.99
2,1876,For Whom The Bell Tolls,154,1,3,0.99
3,1877,Fade To Black,154,1,3,0.99
4,1878,Trapped Under Ice,154,1,3,0.99


In [51]:
tracks_st = pd.read_csv('./tracks_st.csv')
tracks_st.head()

Unnamed: 0,tid,name,aid,mtid,gid,u_price
0,1882,Frantic,155,1,3,0.99
1,1883,St. Anger,155,1,3,0.99
2,1884,Some Kind Of Monster,155,1,3,0.99
3,1885,Dirty Window,155,1,3,0.99
4,1886,Invisible Kid,155,1,3,0.99


####  required tasks
- Concatenate tracks_master, tracks_ride, and tracks_st, in that order, setting sort to True.
- Concatenate tracks_master, tracks_ride, and tracks_st, where the index goes from 0 to n-1.
- Concatenate tracks_master, tracks_ride, and tracks_st, showing only columns that are in all tables.

In [52]:
# Concatenate the tracks
tracks_from_albums = pd.concat([tracks_master,tracks_ride,tracks_st],
                               sort=True)
tracks_from_albums.head()

Unnamed: 0,aid,composer,gid,mtid,name,tid,u_price
0,152,J.Hetfield/L.Ulrich,3,1,Battery,1853,0.99
1,152,K.Hammett,3,1,Master Of Puppets,1854,0.99
2,152,J.Hetfield/L.Ulrich,3,1,Disposable Heroes,1857,0.99
0,154,,3,1,Fight Fire With Fire,1874,0.99
1,154,,3,1,Ride The Lightning,1875,0.99


In [53]:
# Concatenate the tracks so the index goes from 0 to n-1
tracks_from_albums = pd.concat([tracks_master, tracks_ride, tracks_st],
                               ignore_index = True,
                               sort=True)
tracks_from_albums.head()

Unnamed: 0,aid,composer,gid,mtid,name,tid,u_price
0,152,J.Hetfield/L.Ulrich,3,1,Battery,1853,0.99
1,152,K.Hammett,3,1,Master Of Puppets,1854,0.99
2,152,J.Hetfield/L.Ulrich,3,1,Disposable Heroes,1857,0.99
3,154,,3,1,Fight Fire With Fire,1874,0.99
4,154,,3,1,Ride The Lightning,1875,0.99


In [54]:
# Concatenate the tracks, show only columns names that are in all tables
tracks_from_albums = pd.concat([tracks_master, tracks_ride, tracks_st],join= 'inner', sort=True)
tracks_from_albums.head()

Unnamed: 0,aid,gid,mtid,name,tid,u_price
0,152,3,1,Battery,1853,0.99
1,152,3,1,Master Of Puppets,1854,0.99
2,152,3,1,Disposable Heroes,1857,0.99
0,154,3,1,Fight Fire With Fire,1874,0.99
1,154,3,1,Ride The Lightning,1875,0.99


### Task4

#### required datasets

In [55]:
inv_jul = pd.read_csv('./inv_jul.csv')
inv_jul.head()

Unnamed: 0,iid,cid,invoice_date,total,bill_ctry
0,42,51,2009-07-06,1.98,Sweden
1,43,53,2009-07-06,1.98,UK
2,44,55,2009-07-07,3.96,Australia
3,45,59,2009-07-08,5.94,India
4,46,6,2009-07-11,8.91,Czech Republic


In [56]:
inv_aug = pd.read_csv('./inv_aug.csv')
inv_aug.head()

Unnamed: 0,iid,cid,invoice_date,total,bill_ctry
0,49,30,2009-08-06,1.98,Canada
1,50,32,2009-08-06,1.98,Canada
2,51,34,2009-08-07,3.96,Portugal
3,52,38,2009-08-08,5.94,Germany
4,53,44,2009-08-11,8.91,Finland


In [57]:
inv_sep = pd.read_csv('./inv_sep.csv')
inv_sep.head()

Unnamed: 0,iid,cid invoice_date,total,bill_ctry
56,9,2009-09-06,1.98,Denmark
57,11,2009-09-06,1.98,Brazil
58,13,2009-09-07,3.96,Brazil
59,17,2009-09-08,5.94,USA
60,23,2009-09-11,8.91,USA


- Concatenate the three tables together vertically in order with the oldest month first, adding '7Jul', '8Aug', and '9Sep' as keys for their respective months, and save to variable avg_inv_by_month.
- Use the .agg() method to find the average of the total column from the grouped invoices.
- Create a bar chart of avg_inv_by_month.

In [58]:
# Concatenate the tables and add keys
inv_jul_thr_sep = pd.concat([inv_jul, inv_aug, inv_sep], 
                            keys=['7Jul', '8Aug', '9Sep'])
inv_jul_thr_sep

Unnamed: 0,Unnamed: 1,iid,cid,invoice_date,total,bill_ctry,cid invoice_date
7Jul,0,42,51.0,2009-07-06,1.98,Sweden,
7Jul,1,43,53.0,2009-07-06,1.98,UK,
7Jul,2,44,55.0,2009-07-07,3.96,Australia,
7Jul,3,45,59.0,2009-07-08,5.94,India,
7Jul,4,46,6.0,2009-07-11,8.91,Czech Republic,
...,...,...,...,...,...,...,...
9Sep,387,29,,,3.96,Canada,2013-09-03
9Sep,388,33,,,5.94,Canada,2013-09-04
9Sep,389,39,,,8.91,France,2013-09-07
9Sep,390,48,,,13.86,Netherlands,2013-09-12


In [60]:
# inv_jul_thr_sep['total']=inv_jul_thr_sep['total'].astype(float)

In [None]:
inv_jul_thr_sep['total'] = inv_jul_thr_sep['total'].apply(numbers)
inv_jul_thr_sep['total'] = inv_jul_thr_sep['total'].apply(np.int64)

In [None]:
# Group the invoices by the index keys and find avg of the total column
avg_inv_by_month = inv_jul_thr_sep.groupby(level=0).agg({'total':'mean'})

# Bar plot of avg_inv_by_month
avg_inv_by_month.plot(kind='bar')
plt.show()

### Task5

#### Required tables

In [None]:
artists = pd.read_csv('./artist.csv')
artists.head()

In [None]:
albums = pd.read_csv('./album.csv')
albums.head()

- You have been given 2 tables, artists, and albums. Use the console to merge them using artists.merge(albums, on='artid').head(). Adjust the validate argument to answer which statement is False.

1- You can use 'many_to_many' without an error, since there is a duplicate key in one of the tables.

2- You can use 'one_to_many' without error, since there is a duplicate key in the right table.

3- You can use 'many_to_one' without an error, since there is a duplicate key in the left table.

In [None]:
# artists.merge(albums, on='artid').head()

In [None]:
# artists.merge(albums, on='artid', validate = 'one_to_many').head()

### Task6

#### required file

In [None]:
classic_18 = pd.read_csv('./classic_18.csv')
classic_18.head()

In [None]:
classic_19 = pd.read_csv('./classic_19.csv')
classic_19.head()

In [None]:
pop_18 = pd.read_csv('./pop_18.csv')
pop_18.head()

In [None]:
pop_19 = pd.read_csv('./pop_19.csv')
pop_19.head()

- Concatenate the classic_18 and classic_19 tables vertically where the index goes from 0 to n-1, and save to classic_18_19.
- Concatenate the pop_18 and pop_19 tables vertically where the index goes from 0 to n-1, and save to pop_18_19.
- With classic_18_19 on the left, merge it with pop_18_19 on tid using an inner join.
- Use .isin() to filter classic_18_19 where tid is in classic_pop.

In [None]:
# Concatenate the classic tables vertically
classic_18_19 = pd.concat([classic_18, classic_19], ignore_index=True)

# Concatenate the pop tables vertically
pop_18_19 = pd.concat([pop_18, pop_19], ignore_index=True)

# Merge classic_18_19 with pop_18_19
classic_pop = classic_18_19.merge(pop_18_19, on='tid')

# Using .isin(), filter classic_18_19 rows where tid is in classic_pop
popular_classic = classic_18_19[classic_18_19['tid'].isin(classic_pop['tid'])]

# Print popular chart
print(popular_classic)