In [512]:
import glob
import os
import pandas as pd
import psycopg2
from sql_queries import *

In [673]:
songplay_insert = """ INSERT INTO songplays (songplay_id, start_time, user_id,
                        level, song_id, artist_id, session_id, location, user_agent)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """

user_insert = """ INSERT INTO users (user_id, first_name, last_name, gender, level)
                    VALUES (%s, %s, %s, %s, %s)
                    """

songs_insert = """ INSERT INTO songs  (song_id, title, artist_id, year, duration)
                    VALUES (%s, %s, %s, %s, %s)"""

artist_insert = """ INSERT INTO artists  (artist_id, name, location, latitude, longitude)
                    VALUES (%s, %s, %s, %s, %s)"""


time_insert = """ INSERT INTO time (start_time, hour, day, week, month, year, weekday)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)"""


In [674]:
temp_song_artist = """
                   DROP TABLE IF EXISTS temp_song_artist;
                   SELECT songs.song_id as song_id, songs.title as title, songs.duration as duration,
                   artists.name as name, artists.artist_id as artist_id
                   INTO temp_song_artist
                   FROM songs
                   JOIN artists
                   ON songs.artist_id = artists.artist_id
                   """

song_select = (""" SELECT song_id, artist_id
                   FROM temp_song_artist
                   WHERE title = %s and name = %s and duration = %s
               """)

#### Connect to database

In [675]:
try:
    conn = psycopg2.connect(dbname="song", user="postgres", password="123456")
except psycopg2.Error as e:
    print(e)
    
try:
    cur = conn.cursor()
except psycopg2.Error as e:
    print(e)

#### Get file 

In [676]:
def get_files(filepath):
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

## Process song data

In [677]:
song_files = get_files("data/song_data")
song_path = song_files[0]

df = pd.read_json(song_path, lines = True)
df.head()

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,ARD7TVE1187B99BFB1,,,California - LA,Casual,SOMZWCG12A8C13C480,I Didn't Mean To,218.93179,0


## #1: Song table

- Extract Data for Songs Table
- Select columns for song ID, title, artist ID, year, and duration
- Use df.values to select just the values from the dataframe
- Index to select the first (only) record in the dataframe
- Convert the array to a list and set it to song_data

##### Extract file

In [678]:
def extract_json(file):
    dataframe = pd.read_json(file, lines = True)
    return dataframe

In [679]:
# extract function
# print(len(song_files)) -> 71 
def extract():
    # create empty dataframe
    df_list = pd.DataFrame()
    
    # put data to df
    for i in song_files:
        df_list = df_list.append(extract_json(i), ignore_index = True)
    return df_list

In [680]:
extract().head(5)

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,ARD7TVE1187B99BFB1,,,California - LA,Casual,SOMZWCG12A8C13C480,I Didn't Mean To,218.93179,0
1,1,ARMJAGH1187FB546F3,35.14968,-90.04892,"Memphis, TN",The Box Tops,SOCIWDW12A8C13D406,Soul Deep,148.03546,1969
2,1,ARKRRTF1187B9984DA,,,,Sonora Santanera,SOXVLOJ12AB0189215,Amor De Cabaret,177.47546,0
3,1,AR7G5I41187FB4CE6C,,,"London, England",Adam Ant,SONHOTT12A8C13493C,Something Girls,233.40363,1982
4,1,ARXR32B1187FB57099,,,,Gob,SOFSOCN12A8C143F5D,Face the Ashes,209.60608,2007


##### Transform

In [681]:
df_song = extract()
data_song = df_song[['song_id','title','artist_id','year', 'duration']]
data_song.head()


Unnamed: 0,song_id,title,artist_id,year,duration
0,SOMZWCG12A8C13C480,I Didn't Mean To,ARD7TVE1187B99BFB1,0,218.93179
1,SOCIWDW12A8C13D406,Soul Deep,ARMJAGH1187FB546F3,1969,148.03546
2,SOXVLOJ12AB0189215,Amor De Cabaret,ARKRRTF1187B9984DA,0,177.47546
3,SONHOTT12A8C13493C,Something Girls,AR7G5I41187FB4CE6C,1982,233.40363
4,SOFSOCN12A8C143F5D,Face the Ashes,ARXR32B1187FB57099,2007,209.60608


##### Load into database

In [682]:
# try:
#     for index,row in data_song.iterrows():
#         cur.execute(songs_insert,(row.song_id, row.title, row.artist_id, row.year, row.duration))
#         conn.commit()
# except psycopg2.Error as e:
#     print(e)

## #2. Artist table

- Select columns for artist ID, name, location, latitude, and longitude
- Use df.values to select just the values from the dataframe
- Index to select the first (only) record in the dataframe
- Convert the array to a list and set it to artist_data

#### Transform

In [683]:
data_artist = df_song[['artist_id','artist_name','artist_location','artist_latitude', 'artist_longitude']]
data_artist.head()

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,ARD7TVE1187B99BFB1,Casual,California - LA,,
1,ARMJAGH1187FB546F3,The Box Tops,"Memphis, TN",35.14968,-90.04892
2,ARKRRTF1187B9984DA,Sonora Santanera,,,
3,AR7G5I41187FB4CE6C,Adam Ant,"London, England",,
4,ARXR32B1187FB57099,Gob,,,


In [684]:
data_artist = data_artist.drop_duplicates(subset = ["artist_id"])
print(len(data_artist))

69


#### Load into database

In [685]:
# try:
#     for index, row in data_artist.iterrows():
#         cur.execute(artist_insert,(row.artist_id, row.artist_name, 
#                                    row.artist_location, row.artist_latitude, row.artist_longitude))
#         conn.commit()
# except psycopg2.Error as e:
#     print(e)

        

## Process log_data

In [686]:
log_files = get_files("data/log_data")
log_path = log_files[0]
df_log = pd.read_json(log_path, lines=True)
df_log.head(1)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919166796,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",39


### Extract

In [687]:
def extract_log():
    dataframe = pd.DataFrame()
    
    for i in log_files:
        dataframe = dataframe.append(extract_json(i),ignore_index = True)
    return dataframe
df_log = extract_log()
df_log.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919000000.0,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",39
1,,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1540345000000.0,139,,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
2,Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
3,,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1540345000000.0,139,,200,1541106132796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
4,Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8


In [688]:
print(len(df_log))

8056


## #3. Time table

- Filter records by NextSong action
- Convert the ts timestamp column to datetime
- Hint: the current timestamp is in milliseconds
- Extract the timestamp, hour, day, week of year, month, year, and weekday from the ts column and set time_data to a list containing these values in order
- Hint: use pandas' dt attribute to access easily datetimelike properties.
- Specify labels for these columns and set to column_labels
- Create a dataframe, time_df, containing the time data for this file by combining column_labels and time_data into a dictionary and converting this into a dataframe

### Transform

In [689]:
df_log = df_log[df_log['page']=='NextSong']
df_log.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
2,Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
4,Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
5,Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,Quem Quiser Encontrar O Amor,200,1541106496796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
6,The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,Eriatarka,200,1541106673796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
7,Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540345000000.0,139,Becoming Insane,200,1541107053796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8


In [690]:
t = pd.to_datetime(df_log['ts'], unit='ms')

df_time = pd.DataFrame()
df_time['start_time'] = t.dt.time
df_time.head()

Unnamed: 0,start_time
2,21:01:46.796000
4,21:05:52.796000
5,21:08:16.796000
6,21:11:13.796000
7,21:17:33.796000


In [691]:
df_time['hour'] = t.dt.hour
df_time['day']= t.dt.day
df_time['weekofyear']= t.dt.isocalendar().week
df_time['month'] = t.dt.month
df_time['year'] = t.dt.year
df_time['weekday'] = t.dt.weekday
df_time = df_time.drop_duplicates(subset='start_time')
df_time.head()

Unnamed: 0,start_time,hour,day,weekofyear,month,year,weekday
2,21:01:46.796000,21,1,44,11,2018,3
4,21:05:52.796000,21,1,44,11,2018,3
5,21:08:16.796000,21,1,44,11,2018,3
6,21:11:13.796000,21,1,44,11,2018,3
7,21:17:33.796000,21,1,44,11,2018,3


### Load into databse

In [692]:
# try:
#     for index, row in df_time.iterrows():
#         cur.execute(time_insert, (row.start_time, row.hour, row.day, row.weekofyear, row.month, row.year, row.weekday))
#         conn.commit()
# except psycopg2.Error as e:
#     print(e)

## #4: users Table

In [693]:
df_user = df_log[['userId','firstName', 'lastName', 'gender','level']]
df_user.head()

Unnamed: 0,userId,firstName,lastName,gender,level
2,8,Kaylee,Summers,F,free
4,8,Kaylee,Summers,F,free
5,8,Kaylee,Summers,F,free
6,8,Kaylee,Summers,F,free
7,8,Kaylee,Summers,F,free


In [694]:
# drop duplicate value
df_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6820 entries, 2 to 8055
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     6820 non-null   object
 1   firstName  6820 non-null   object
 2   lastName   6820 non-null   object
 3   gender     6820 non-null   object
 4   level      6820 non-null   object
dtypes: object(5)
memory usage: 319.7+ KB


In [695]:
df_user['userId'] = df_user['userId'].astype(int)
df_user = df_user.sort_values(by='userId')
df_user = df_user.drop_duplicates(subset='userId')
df_user.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user['userId'] = df_user['userId'].astype(int)


Unnamed: 0,userId
count,96.0
mean,51.65625
std,29.058862
min,2.0
25%,26.75
50%,52.5
75%,76.25
max,101.0


In [696]:
df_user.head(10)

Unnamed: 0,userId,firstName,lastName,gender,level
967,2,Jizelle,Benjamin,F,free
185,3,Isaac,Valdez,M,free
7344,4,Alivia,Terrell,F,free
8055,5,Elijah,Davis,M,free
5573,6,Cecilia,Owens,F,free
1849,7,Adelyn,Jordan,F,free
6578,8,Kaylee,Summers,F,free
1105,9,Wyatt,Scott,M,free
4336,10,Sylvie,Cruz,F,free
1290,11,Christian,Porter,F,free


In [697]:
# try:
#     for index, row in df_user.iterrows():
#         cur.execute(user_insert, list(row))
#         conn.commit()
# except psycopg2.Error as e: 
#     print(e)

## #5. Songplays

In [698]:
# # create temp song table
# try:
#     cur.execute(temp_song_artist)
#     conn.commit()
# except psycopg2.Error as e:
#     print(e)

In [702]:
# for index, row in df_log.iterrows():
#     # get songid and artistid from song and artist tables
#     cur.execute(song_select, (row.song, row.artist, row.length))
#     results = cur.fetchone()

#     if results:
#         songid, artistid = results
#     else:
#         songid, artistid = None, None
#     time = pd.to_datetime(row['ts'], unit='ms')
#     # insert songplay record
#     cur.execute(songplay_insert, (index,time,row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent))
#     conn.commit()