# PROJECT 1: EXPLORATORY DATA ANALYSIS

## Project Name: 

SOUNDCLOUD ANALYSIS

## Assignment Table

| ID            | Fullname              | Contribution rate |
| :-----------: | :-------------------  | :-----------:     |
| 20127323      | Võ Nhật Tân           |                   |
| 20127447      | Ngô Đức Bảo           |                   |
| 20127275      | Lê Nguyễn Nhật Phú    |                   |
| 20127681      | Nguyễn Thiên Phúc     |                   |



# Import Libraries

In [1]:
import numpy as np
import time
import pandas as pd
import random
import requests
import requests_cache
from bs4 import BeautifulSoup
import os.path
from concurrent.futures import ThreadPoolExecutor, process, wait

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
requests_cache.install_cache('cache', expire_after=None)

In [4]:
SLEEP_TIME = 2
SCROLL_PAUSE_TIME = 0.5

## Data Collection

In [137]:
def asynchronousGetWebData(listData, getDataFunc, *argv, max_screens = 6,):
    '''
    Input:
        - listData: list of urls which need to parse the content to the useful information 
        - getDataFunc - func: Fucntion supports parse each item in listData to get information
            * First parameter must be the item in list data
            * The next parameter is saved in *argv
        - max_screens - int: Maximum screens that open at the same time
        
    Output:
        - Returns list of result data
    '''
    result_value = []
    
    for item_index in range(0, len(listData), max_screens):
        start = item_index
        end = item_index + max_screens
        if end > len(listData):
            end = len(listData)
            
        tmpData = listData[start:end]
        threadList = []
        with ThreadPoolExecutor() as executor:
            for url in tmpData:
                threadList.append(executor.submit(getDataFunc, url, *argv))
                
        wait(threadList)
        
        for thread in threadList:
            if thread.done() and not thread.cancelled():
                result_value.append(thread.result())
    return result_value
    

### Collect Users

In [138]:
urlList = [ 'https://soundcloud.com/user-593335594/sets/is-500-songs-the-playlist', \
            'https://soundcloud.com/user-593335594/sets/amazing-artistry-2',\
            'https://soundcloud.com/user-593335594/sets/playlist-that-wont-crash-my',\
            'https://soundcloud.com/user-593335594/sets/my-personal-favorites',\
            'https://soundcloud.com/user-593335594/sets/amazing',\
            'https://soundcloud.com/user-593335594/sets/i-found-more-music',\
            'https://soundcloud.com/user-593335594/sets/chivalry-is-dead',\
            'https://soundcloud.com/idla/sets/old-songs',\
            'https://soundcloud.com/idla/sets/pop-playlist-2020',\
            'https://soundcloud.com/idla/sets/lost-in-the-blues',\
            'https://soundcloud.com/user987150052/sets/hot-playlist',\
            'https://soundcloud.com/user61185041/sets/hot-playlist',\
            'https://soundcloud.com/discover/sets/charts-top:all-music:vn',\
            'https://soundcloud.com/discover/sets/charts-trending:danceedm:vn',\
            'https://soundcloud.com/bytufekci/sets/top-100-songs-of-2020']

In [139]:
def getAllUserURL(url):
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get(url)
    userProfileUrl = set()
    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()
    except:
        pass

    #Scroll to the bottom
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'trackItem__username')))
    newUserUrl = driver.find_elements(By.CLASS_NAME, 'trackItem__username')
    for user in newUserUrl:
        userProfileUrl.add(user.get_attribute('href'))
    driver.close()
    return userProfileUrl

In [140]:
userUrlList = []
if os.path.isfile('linkusercaches.txt'):
    with open('linkusercaches.txt', 'r') as f:
        userUrlList = f.read().splitlines()

else:
    data = asynchronousGetWebData(urlList, getAllUserURL)
    userUrlList = []
    for item in data:
        userUrlList = userUrlList + list(item)
    userUrlList = set(userUrlList)
    with open(r'linkusercaches.txt', 'w') as fp:
        for item in userUrlList:
            # write each item on a new line
            fp.write("%s\n" % item)
            
print(f'The number of profile link: {len(userUrlList)}')

The number of profile link: 1354


In [141]:
def get_user_info(url):    
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get(url)
    #try:
    #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()
    #except:
        #pass
    username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h2.profileHeaderInfo__userName'))).text

    isVertified = driver.find_element(By.CSS_SELECTOR, 'h2.profileHeaderInfo__userName>div').text
    if len(isVertified) > 0: username = username.strip()[:-len(isVertified.strip())].strip()
    userID = url.split('/')[-1]
    userurl = url
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'infoStats__value.sc-font-light')))
    info = driver.find_elements(By.CLASS_NAME, 'infoStats__value.sc-font-light')
    followers = info[0].text
    following = info[1].text
    tracks = info[2].text
    #go to user's sets which have the url https://soundcloud.com/{userID}/sets
    driver.get(f'https://soundcloud.com/{userID}/sets')
    playlistIDs = []
    time.sleep(1)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".emptyNetworkPage__headline, .sc-link-primary.soundTitle__title.sc-link-dark.sc-text-h4")))

    if not driver.find_elements(By.CLASS_NAME, 'emptyNetworkPage__headline'):
        playlist = driver.find_elements(By.CLASS_NAME, 'sc-link-primary.soundTitle__title.sc-link-dark.sc-text-h4')
        if len(playlist) != 0:
            for p in playlist:
                playlistIDs.append(p.get_attribute('href').split('/')[-1])

        
    driver.close()
    return [userID, username, len(isVertified.strip()) > 0, userurl, followers, following, tracks, ";".join(playlistIDs)]

In [None]:
#this cell takes a long time to run (more than 114 minutes to go through 1278 users)
#data = asynchronousGetWebData(userUrlList, get_user_info)
index_range_list = np.linspace(0, len(userUrlList), 6).astype(np.int64)
users = pd.DataFrame(columns=['userID', 'username', 'Verified', 'userUrl', 'followers', 'following', 'tracks', 'playlistIDs'])
for i in range(5):
    start_index = index_range_list[i]
    end_index = index_range_list[i + 1]
    data = asynchronousGetWebData(userUrlList[start_index:end_index], get_user_info)

    mini_user = pd.DataFrame(data, columns=['userID', 'username', 'Verified', 'userUrl', 'followers', 'following', 'tracks', 'playlistIDs'])
    users = pd.concat([users, mini_user])
    
users.to_csv('users.csv', index=False)

### Collect Playlists

In [14]:
users = pd.read_csv('users.csv', index_col="userID")
print(users.shape)
users.head()

(1354, 7)


Unnamed: 0_level_0,username,Verified,userUrl,followers,following,tracks,playlistIDs
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
e-p-beats,P-Town Anthems (E.P. Beats),False,https://soundcloud.com/e-p-beats,510,1327,2,the-arcane-trilogy;prod-by-e-p-beats;killers-i...
mahnorabotdf,Mahnora-Botdf,False,https://soundcloud.com/mahnorabotdf,89,1,14,mahnorabotdf
alecbenjamin,Alec Benjamin,True,https://soundcloud.com/alecbenjamin,223K,0,91,these-two-windows
muhammad-salman-mansoor,Shaikh & Shaikhi,False,https://soundcloud.com/muhammad-salman-mansoor,2043,161,39,
jialaji,Jiala Ji,False,https://soundcloud.com/jialaji,164,204,9,


In [15]:
users["playlistIDs"] = users["playlistIDs"].str.split(';')

In [16]:
userPlaylist = users.explode("playlistIDs").dropna(subset=["playlistIDs"])

In [17]:
listPlaylistURL = userPlaylist[['playlistIDs', 'userUrl']]
listPlaylistURL['playlistUrl'] = listPlaylistURL['userUrl'] + '/sets/' + listPlaylistURL['playlistIDs']
listPlaylistURL.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listPlaylistURL['playlistUrl'] = listPlaylistURL['userUrl'] + '/sets/' + listPlaylistURL['playlistIDs']


In [None]:
def getPlayListInfo(urlPlaylistInfo):
    userID ,urlPlaylist = urlPlaylistInfo[0], urlPlaylistInfo[1]
    
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get(urlPlaylist)
    
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    #check Likes 
    time.sleep(SLEEP_TIME)
    try:
        Likes = driver.find_element(By.CLASS_NAME, 'sc-ministats-likes').find_elements(By.TAG_NAME, 'span')[1].text
    except:
        Likes = None
    #check Reposts
    try:
        Reposts = driver.find_element(By.CLASS_NAME, 'sc-ministats-reposts').find_elements(By.TAG_NAME, 'span')[1].text
    except:
        Reposts = None
    #check Datetime
    try:
        Datetime = driver.find_element(By.CLASS_NAME, 'relativeTime').get_attribute('datetime')
    except:
        Datetime = None
    #check TagContent
    try:
        TagContent = driver.find_element(By.CLASS_NAME, 'sc-tagContent').text.split(',')
    except:
        TagContent = None
    

    driver.close()
    return [userID, urlPlaylist, Likes, Reposts, TagContent, Datetime]

In [None]:
index_range_list = np.linspace(0, len(listPlaylistURL), 10).astype(np.int64)
playlists = pd.DataFrame(columns=['UserIDPlaylist' ,'PlaylistURL', 'Likes', 'Reposts', 'TagContent' , 'Uploadtime'])

for i in range(len(index_range_list) - 1):
    start_index = index_range_list[i]
    end_index = index_range_list[i + 1]
    data = asynchronousGetWebData(listPlaylistURL[["userID", "playlistUrl"]][start_index:end_index].values, getPlayListInfo)

    mini_playlists = pd.DataFrame(data, columns=['UserIDPlaylist' ,'PlaylistURL', 'Likes', 'Reposts', 'TagContent' , 'Uploadtime'])
    playlists = pd.concat([playlists, mini_playlists])


In [None]:
playlists = playlists[~pd.isnull(playlists['Uploadtime'])]
for row in playlists.loc[playlists['TagContent'].isnull(), 'TagContent'].index:
    playlists.at[row, 'TagContent'] = ['']
def fill_correct_form(row):
    if row['TagContent'][0] == '':
        return None
    else:
        return row['TagContent'][0]
playlists['TagContent'] = playlists.apply(fill_correct_form, axis=1)


In [None]:
driver = webdriver.Chrome()
driver.maximize_window()
url_col = list()
for i in range(len(playlists)):
    row = list()
    driver.get(playlists['PlaylistURL'][i])

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'trackItem__trackTitle')))
        list_of_url = driver.find_elements(By.CLASS_NAME, 'trackItem__trackTitle')
        for element in driver.find_elements(By.CLASS_NAME, 'trackItem__trackTitle'):
            row.append(element.get_attribute('href').split('?')[0])
    except:
        pass

    url_col.append(row)
driver.quit()

url_col_new = list()
for row in url_col:
    url_col_new.append(';'.join(row))
playlists.insert(len(playlists.columns), 'trackUrl', url_col_new)

def empty_to_none(row):
    if row['trackUrl'] == '':
        return None
    else:
        return row['trackUrl']

playlists = playlists['trackUrl'].apply(empty_to_none, axis=1)
playlists.to_csv('playlists.csv', index=False)

### Collect Tracks

In [78]:
playlists = pd.read_csv('playlists.csv')
playlists

Unnamed: 0,UserIDPlaylist,PlaylistURL,Likes,Reposts,TagContent,Uploadtime,trackUrl
0,e-p-beats,https://soundcloud.com/e-p-beats/sets/the-arca...,3,3,,2022-01-14T22:11:32.000Z,https://soundcloud.com/lala-land-361191160/arc...
1,e-p-beats,https://soundcloud.com/e-p-beats/sets/prod-by-...,44,18,Hip-hop & Rap,2019-07-30T21:23:37.000Z,https://soundcloud.com/dj_david254/e-p-speaks-...
2,mahnorabotdf,https://soundcloud.com/mahnorabotdf/sets/mahno...,1,,blue,2015-07-22T07:02:08.000Z,https://soundcloud.com/mahnorabotdf/25-hours-c...
3,alecbenjamin,https://soundcloud.com/alecbenjamin/sets/these...,4441,348,,2020-05-29T04:01:28.000Z,https://soundcloud.com/alecbenjamin/mind-is-a-...
4,etherealmedia,https://soundcloud.com/etherealmedia/sets/sgv-...,8,2,,2020-04-22T03:13:36.000Z,https://soundcloud.com/etherealmedia/sgv-snow-...
...,...,...,...,...,...,...,...
2517,tastynetwork,https://soundcloud.com/tastynetwork/sets/blank...,30,4,Dance & EDM,2020-12-04T18:14:49.000Z,https://soundcloud.com/tastynetwork/message-no...
2518,tastynetwork,https://soundcloud.com/tastynetwork/sets/stay-...,60,14,Electronic,2020-09-21T16:04:35.000Z,https://soundcloud.com/tastynetwork/last-islan...
2519,darren-styles,https://soundcloud.com/darren-styles/sets/2022...,73,9,,2022-04-08T11:22:08.000Z,https://soundcloud.com/darren-styles/darren-st...
2520,darren-styles,https://soundcloud.com/darren-styles/sets/darr...,611,117,,2017-05-16T20:17:58.000Z,https://soundcloud.com/monstercat/darren-style...


In [79]:
list_track_url = playlists['trackUrl']
list_track_url = list_track_url.str.split(';')
list_track_url = list_track_url.explode().dropna()
type(list_track_url)

pandas.core.series.Series

In [80]:
def asynchronousGetWebData_2(listData, getDataFunc, *argv, max_screens = 3,):
    '''
    Input:
        - listData: list of urls which need to parse the content to the useful information 
        - getDataFunc - func: Fucntion supports parse each item in listData to get information
            * Parameter base on function's parameter(s)
            * Parameters must be ordered
        - max_screens - int: Maximum screens that open at the same time
        
    Output:
        - Returns list of result data
    '''
    result_value = []

    i_range_list = np.linspace(0, len(listData), max_screens + 1).astype(np.int64)

    thread_list = []
    with ThreadPoolExecutor() as executor:
        for i in range(len(i_range_list) - 1):
            start = i_range_list[i]
            end = i_range_list[i + 1]
            thread_list.append(executor.submit(getDataFunc, listData[start:end], *argv))

    wait(thread_list)

    # thread.result() gonna be a 2D list
    for thread in thread_list:
        if thread.done() and not thread.cancelled():
            result_value.extend(thread.result())

    return result_value
    

In [81]:
def getTrackInfo(url_list):
    driver = webdriver.Chrome()
    driver.maximize_window()

    track_info_list = list()

    for url in url_list: 
        driver.get(url)
        
        # get user that upload the track
        user_id = url.split('/')[3]

        # get url track (for clearer variable)
        url_track = url
        
        time.sleep(5)
        #get plays, likes, reposts container
        try:
            container = driver.find_element(By.CLASS_NAME, 'listenEngagement__stats')
        except:
            container = driver.find_element(By.CLASS_NAME, 'sound__soundStats')
        
        #check plays
        try:
            plays = container.find_element(By.CLASS_NAME, 'sc-ministats-plays').find_elements(By.TAG_NAME, 'span')[1].text
        except:
            plays = None

        #check likes
        try:
            likes = container.find_element(By.CLASS_NAME, 'sc-ministats-likes').find_elements(By.TAG_NAME, 'span')[1].text
        except:
            likes = None

        #check reposts
        try:
            reposts = container.find_element(By.CLASS_NAME, 'sc-ministats-reposts').find_elements(By.TAG_NAME, 'span')[1].text
        except:
            reposts = None

         #check comments
        try:
            comments = driver.find_element(By.CLASS_NAME, 'commentsList__actualTitle').text.split(' ')[0]
        except:
            comments = None

        #check date time
        try:
            date_time = driver.find_element(By.CLASS_NAME, 'relativeTime').get_attribute('datetime')
        except:
            date_time = None

        #check tag content
        try:
            tag_content = driver.find_element(By.CLASS_NAME, 'sc-tagContent').text
        except:
            tag_content = None
        
        track_info_list.append([user_id, url_track, plays, likes, reposts, comments, tag_content, date_time])
    
    driver.quit()

    return track_info_list

In [85]:
index_range_list = np.linspace(0, len(list_track_url), 6).astype(np.int64)
tracks = pd.DataFrame(columns=['UserIDTrack' ,'TrackURL','Plays', 'Likes', 'Reposts', 'Comments', 'TagContent' , 'Uploadtime'])

for i in range(len(index_range_list) - 1):
    start_index = index_range_list[i]
    end_index = index_range_list[i + 1]
    data = asynchronousGetWebData_2(list_track_url[start_index:end_index].values, getTrackInfo)

    mini_tracks = pd.DataFrame(data, columns=['UserIDTrack' ,'TrackURL','Plays', 'Likes', 'Reposts', 'Comments', 'TagContent' , 'Uploadtime'])
    tracks = pd.concat([tracks, mini_tracks])

In [91]:
tracks.to_csv('tracks.csv', index=False)

### Make three file relate to each other

## Exploratory Data Analysis

## References