# PROJECT 1: EXPLORATORY DATA ANALYSIS

## Project Name

SOUNDCLOUD ANALYSIS

## Assignment Table

| ID            | Fullname              | Contribution rate |
| :-----------: | :-------------------  | :-----------:     |
| 20127323      | Võ Nhật Tân           |                   |
| 20127447      | Ngô Đức Bảo           |                   |
| 20127275      | Lê Nguyễn Nhật Phú    |                   |
| 20127681      | Nguyễn Thiên Phúc     |                   |



## Import Libraries

In [96]:
import numpy as np
import time
import pandas as pd
import random
import requests
import requests_cache
from bs4 import BeautifulSoup
import os.path

In [97]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [74]:
# Install a global cache named 'cache' for HTTP calls made through `requests`.
# NOTE(review): expire_after=None presumably means cached entries never
# expire -- confirm against the installed requests_cache version.
requests_cache.install_cache('cache', expire_after=None)

In [75]:
# Pause lengths in seconds (both values are passed to time.sleep()).
# SLEEP_TIME: wait after opening a playlist page in getUserUrl().
SLEEP_TIME = 2
# SCROLL_PAUSE_TIME: wait after each scroll-to-bottom so new items can load.
SCROLL_PAUSE_TIME = 0.5

## Data Collection

### Collect Users

In [76]:
# Playlist pages handed to getUserUrl(); the uploaders of their tracks become
# the set of users to scrape.
urlList = [
    'https://soundcloud.com/user-593335594/sets/is-500-songs-the-playlist',
    'https://soundcloud.com/user-593335594/sets/amazing-artistry-2',
    'https://soundcloud.com/user-593335594/sets/playlist-that-wont-crash-my',
    'https://soundcloud.com/user-593335594/sets/my-personal-favorites',
    'https://soundcloud.com/user-593335594/sets/amazing',
    'https://soundcloud.com/user-593335594/sets/i-found-more-music',
    'https://soundcloud.com/user-593335594/sets/chivalry-is-dead',
    'https://soundcloud.com/idla/sets/old-songs',
    'https://soundcloud.com/idla/sets/pop-playlist-2020',
    'https://soundcloud.com/idla/sets/lost-in-the-blues',
    'https://soundcloud.com/user987150052/sets/hot-playlist',
    'https://soundcloud.com/user61185041/sets/hot-playlist',
    'https://soundcloud.com/discover/sets/charts-top:all-music:vn',
    'https://soundcloud.com/discover/sets/charts-trending:danceedm:vn',
    'https://soundcloud.com/bytufekci/sets/top-100-songs-of-2020',
]

In [83]:
def getUserUrl(urlList):
    """Collect the profile URLs of the uploaders of every track in the given playlists.

    Each playlist page is opened in a Chrome session and scrolled until the
    document height stops growing, so that lazily loaded tracks are rendered.
    The ``href`` of every ``trackItem__username`` element is then collected.

    Args:
        urlList: iterable of playlist page URLs.

    Returns:
        set of str: the distinct uploader profile URLs found.

    Raises:
        selenium.common.exceptions.TimeoutException: if the cookie banner (on
            the first page) or the track list (on any page) does not become
            clickable within 10 seconds.
    """
    driver = webdriver.Chrome()
    userUrl = set()
    try:
        driver.maximize_window()
        # The cookie banner only has to be accepted once per browser session.
        isCheckCookie = True
        for url in urlList:
            driver.get(url)
            time.sleep(SLEEP_TIME)
            if isCheckCookie:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()
                isCheckCookie = False
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Stop once scrolling no longer makes the page taller
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'trackItem__username')))
            for element in driver.find_elements(By.CLASS_NAME, 'trackItem__username'):
                href = element.get_attribute('href')
                # get_attribute() returns None when the attribute is missing;
                # skip those so no bogus "None" entry ends up in the result.
                if href:
                    userUrl.add(href)
            print("Total of user's url", len(userUrl))
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window), and running it in `finally` avoids leaking the browser
        # when a wait times out or navigation fails.
        driver.quit()
    return userUrl


In [103]:
# Load the user URLs from the on-disk cache when it exists; otherwise scrape
# them with getUserUrl() and write the cache for the next run.
userUrlList = []
if os.path.isfile('linkusercaches.txt'):
    with open('linkusercaches.txt', 'r') as f:
        userUrlList = f.read().splitlines()
else:
    # getUserUrl() returns a set. Convert it to a list so both branches give
    # the same type and the result can be sliced into batches later on.
    userUrlList = list(getUserUrl(urlList))
    with open(r'linkusercaches.txt', 'w') as fp:
        # write each item on a new line
        fp.writelines("%s\n" % item for item in userUrlList)


In [157]:
def get_user_info(userUrl):
    """Scrape profile statistics and playlist IDs for each user profile URL.

    For every URL the profile page is opened to read the display name, the
    verification badge and the three header statistics; then the user's
    ``/sets`` page is opened to read the playlist links that are rendered on
    load (the page is not scrolled, so lazily loaded playlists are not seen).

    Args:
        userUrl: iterable of profile URLs of the form
            ``https://soundcloud.com/<userID>``.

    Returns:
        pandas.DataFrame: one row per URL, indexed 0..n-1, with columns
        ``userID, username, Vertified, userUrl, followers, following, tracks,
        playlistIDs``. The three statistics are the raw text shown on the
        page; ``playlistIDs`` is a ';'-joined string.

    Raises:
        selenium.common.exceptions.TimeoutException: if an expected element
            does not appear within 10 seconds.
        selenium.common.exceptions.NoSuchElementException: if the badge
            container is missing from the profile header.
        IndexError: if fewer than three statistics are present.
    """
    columns = ['userID', 'username', 'Vertified', 'userUrl', 'followers', 'following', 'tracks', 'playlistIDs']
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end, instead of enlarging the DataFrame one row at a time.
    rows = []

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        for index, url in enumerate(userUrl):
            driver.get(url)

            if index == 0:
                # accept cookie (only needed once per browser session)
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()

            username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h2.profileHeaderInfo__userName'))).text.strip()

            # The badge text is rendered inside the name heading, so it has to
            # be cut off the end of the name. Guarding on the stripped badge
            # avoids the `[:-0]` slice that would blank the whole username.
            badge = driver.find_element(By.CSS_SELECTOR, 'h2.profileHeaderInfo__userName>div').text.strip()
            if badge and username.endswith(badge):
                username = username[:-len(badge)].strip()

            # rstrip('/') keeps a trailing slash from producing an empty ID.
            userID = url.rstrip('/').split('/')[-1]

            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'infoStats__value.sc-font-light')))
            info = driver.find_elements(By.CLASS_NAME, 'infoStats__value.sc-font-light')
            followers = info[0].text
            following = info[1].text
            tracks = info[2].text

            # go to user's sets which have the url https://soundcloud.com/{userID}/sets
            driver.get(f'https://soundcloud.com/{userID}/sets')
            # Wait until either the "empty" headline or a playlist title is present.
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".emptyNetworkPage__headline, .sc-link-primary.soundTitle__title.sc-link-dark.sc-text-h4")))

            playlistIDs = []
            if not driver.find_elements(By.CLASS_NAME, 'emptyNetworkPage__headline'):
                for p in driver.find_elements(By.CLASS_NAME, 'sc-link-primary.soundTitle__title.sc-link-dark.sc-text-h4'):
                    href = p.get_attribute('href')
                    # get_attribute() returns None for a missing attribute.
                    if href:
                        playlistIDs.append(href.split('/')[-1])

            rows.append([userID, username, bool(badge), url, followers, following, tracks, ";".join(playlistIDs)])
    finally:
        # quit() ends the browser session (close() only closes the window);
        # `finally` makes sure the browser is released when scraping fails.
        driver.quit()

    return pd.DataFrame(rows, columns=columns)

In [158]:
get_user_info(["https://soundcloud.com/dougiemaclean", "https://soundcloud.com/hvannnef"])

Unnamed: 0,userID,username,Vertified,userUrl,followers,following,tracks,playlistIDs
0,dougiemaclean,Dougie MacLean,False,https://soundcloud.com/dougiemaclean,77,0,89,
1,hvannnef,Hoang Van,False,https://soundcloud.com/hvannnef,754,1,42,nspopkuwmbc0;nghe-vao-ngay-mua-nha;nhac-de-ngu...


In [162]:
# This cell takes a long time to run (more than 114 minutes to go through 1278 users).
# Users are scraped in batches of `step`; after each batch the cumulative
# table is written to 'users<batch start>.csv' as a checkpoint.
users = pd.DataFrame(columns=['userID', 'username', 'Vertified', 'userUrl', 'followers', 'following', 'tracks', 'playlistIDs'])

# userUrlList is a set when it comes straight from getUserUrl(); a set cannot
# be sliced, so materialise it as a list once.
pendingUrls = list(userUrlList)

step = 200
batches = []
for i in range(0, len(pendingUrls), step):
    start = i
    end = min(i + step, len(pendingUrls))
    batches.append(get_user_info(pendingUrls[start:end]))
    # Every batch comes back indexed from 0; ignore_index renumbers the rows
    # so the combined table has unique index labels.
    users = pd.concat(batches, axis=0, ignore_index=True)
    users.to_csv('users' + str(i) + '.csv', encoding='utf-8-sig', index=False)
    print(users.shape)

(3, 8)


In [50]:
# Show the collected user table, then save it to 'users.csv' without the
# index column.
print(users)
users.to_csv(path_or_buf='users.csv', index=False)

                                userID                          username  \
0                       ambition-beats                 Ambition Verified   
1                   downchildbluesband              Downchild Blues Band   
2                            blindface                         BlindFace   
3                             leiticia                           Letícia   
4                              adamazz                            NoCtrl   
...                                ...                               ...   
1274                          maroon-5                 Maroon 5 Verified   
1275  five-finger-death-punch-official  Five Finger Death Punch Verified   
1276                    allaboutmaggie                      Maggie Szabo   
1277                        en_network           EN - NETWORK™️ Verified   
1278              david-clayton-thomas              David Clayton-Thomas   

                                                userUrl followers following  \
0       

### Collect Playlists

### Collect Tracks

## Exploratory Data Analysis

## References