In [484]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import requests
import json
import os
from lxml import etree
import urllib
import time
from datetime import datetime
from selenium import webdriver
from random import randint
from fake_useragent import UserAgent
from tqdm import tqdm
import pickle

In [482]:
category_url = 'https://themeforest.net/category'
ua = UserAgent()

headers = {
    'User-Agent': ua.random,
    'Content-Type': 'text/html'}

response_cat = requests.get(category_url, headers = headers, timeout = 1)

cookies = response_cat.cookies
soup = bs(response_cat.content, 'lxml')

## Create Main DF
theme_df = pd.DataFrame(columns = ['datetime', 'prod_name', 'sort_type', 'prod_rank', 'prod_url', 'prod_author', 
                                  'prod_cat', 'prod_price', 'prod_rating', 'prod_sales', 'prod_details', 
                                  'main_cat'])
    
## Retrieve categories from base url
first_categories = soup.find_all(attrs = {'id': 'first'})
links = first_categories[0].find_all('a')


## Append links to list
category_url_ls = []
category_name_ls = []
for link in links:
    if link.get('href').split('/')[-2] == 'category':
        category_url_ls.append(link.get('href'))
        category_name_ls.append(link.get('href').replace('/category/',''))



browser = webdriver.Chrome()

base_url = 'https://themeforest.net'

sort_dict = {'best_sellers': '&referrer=search&sort=sales&utf8=✓&view=list',
             'trending': '&referrer=search&sort=trending&utf8=✓&view=list',
             'highest_rating': '&referrer=search&sort=rating&utf8=✓&view=list',
             'low_to_high_price': '&referrer=search&sort=price-asc&utf8=✓&view=list',
             'high_to_low_price': '&referrer=search&sort=price-desc&utf8=✓&view=list'}

master_ls = []

## iterate through categories retrieved
for c, cat in enumerate(category_url_ls):
    
    cat_url = base_url + cat
    cat_name = category_name_ls[c]
    print('Starting iteration {} for category {} now!'.format(c + 1, cat_name))
    ## iterate through sort methods
    for sort_key, sort_url in sort_dict.items():
    
        ## iterate through pages
        for i in tqdm(range(1,61)):

            cat_page_url = cat_url + '?page={}'.format(i) + sort_url
            
            time.sleep(randint(1,3))
            
            browser.get(cat_page_url)
                        
            cat_response = browser.page_source
            
            cat_soup = bs(cat_response, 'lxml')

            ## retrieve headings
            prod_heading = cat_soup.find_all('h3', attrs = {'class': 'product-list__heading'})

            ## extract name, url, and rank from heading
            prod_names_ls = []
            prod_urls_ls = []
            prod_ranks_ls = []
            for prod in prod_heading:
                temp_a = prod.find('a')
                prod_url = 'https://themeforest.net' + temp_a.get('href')
                prod_rank = prod_url.split('rank=')[1]
                prod_name = (temp_a.getText())

                prod_names_ls.append(prod_name)
                prod_urls_ls.append(prod_url)
                prod_ranks_ls.append(prod_rank)

            ## retrieve category path
            prod_meta_cat = cat_soup.find_all('span', attrs = {'class': 'meta-categories -no-slash'})

            ## extract category path
            prod_category_ls = []
            for meta_cat in prod_meta_cat:
                temp_ls = []
                for b_tag in meta_cat.find_all('b'):
                    temp_ls.append(b_tag.getText())

                prod_category_ls.append(temp_ls)   

            ## retrieve product details
            prod_details = cat_soup.find_all(attrs = {'class': 'product-list__column-category'})

            ## retrieve product details
            prod_details_ls = []
            for prod_detail in prod_details:
                temp_details = []
                for br, br_tag in enumerate(prod_detail.find('p').getText().split('\n')):
                    if len(br_tag) > 10:
                        temp_details.append(br_tag.replace('    ', ''))
                prod_details_ls.append(temp_details)

            ## retrieve product authors
            prod_author = cat_soup.find_all('div', attrs = {'class': 'product-list__info-author'})

            ## extract authors
            prod_authors_ls = []
            for author in prod_author:
                prod_authors_ls.append(author.find('a').getText())

            ## retrieve product prices
            prod_price = cat_soup.find_all('div', attrs = {'class': 'product-list__column-price'})

            ## extract product prices
            prod_prices_ls = []
            for price in prod_price:
                prod_prices_ls.append(price.find('p').getText().replace('$',''))

            ## retrieve product ratings and sales
            prod_num_ratings = cat_soup.find_all(attrs = {'class': 'product-list__info-desktop'})

            ## extract ratings and sales
            prod_num_ratings_ls = []
            prod_num_sales_ls = []
            for num_rating in prod_num_ratings:
                temp_num_rating = ''
                temp_num_sales = ''
                for text in num_rating.getText().split('\n'):
                    text = text.replace(' ', '')
                    if len(text) > 0:
                        if 'ratings' in text:
                            temp_num_rating = text.split('r')[0]
                        if 'Sales' in text:
                            temp_num_sales = text.split('S')[0]
                prod_num_ratings_ls.append(temp_num_rating)
                prod_num_sales_ls.append(temp_num_sales)

            temp_df = pd.DataFrame({'datetime': now, 'prod_name': prod_names_ls, 'sort_type': sort_key, 'prod_rank': prod_ranks_ls, 
                 'prod_url': prod_urls_ls, 'prod_author': prod_authors_ls, 'prod_cat': prod_category_ls,
                 'prod_price': prod_prices_ls, 'prod_rating': prod_num_ratings_ls,
                 'prod_sales': prod_num_sales_ls,  'prod_details': prod_details_ls, 'main_cat': cat_name})

            temp_dict = temp_df.T.to_dict()
            
            master_ls.extend(temp_dict)

theme_df = pd.DataFrame(master_ls).T         

os.chdir('/home/valesco/Datasets/business_data/theme_forest_scrapes/')

now = datetime.now().strftime('%Y-%d-%m %H:%M')
file_name = 'theme_forest_scrape_' + cat_name + '_' + now + '.csv'

theme_df.to_csv(file_name, index = False, sep = '|')


  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 1 for category site-templates now!


[A
  2%|▏         | 1/60 [00:09<08:58,  9.12s/it][A
100%|██████████| 60/60 [07:07<00:00,  6.76s/it]
100%|██████████| 60/60 [07:17<00:00,  7.64s/it]
100%|██████████| 60/60 [07:20<00:00,  7.53s/it]
100%|██████████| 60/60 [07:24<00:00,  7.73s/it]
100%|██████████| 60/60 [07:04<00:00,  7.05s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 2 for category wordpress now!


100%|██████████| 60/60 [07:06<00:00,  7.33s/it]
100%|██████████| 60/60 [07:16<00:00,  7.62s/it]
100%|██████████| 60/60 [07:00<00:00,  6.58s/it]
100%|██████████| 60/60 [06:58<00:00,  7.11s/it]
100%|██████████| 60/60 [06:57<00:00,  6.76s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 3 for category cms-themes now!


100%|██████████| 60/60 [07:41<00:00,  6.28s/it]
100%|██████████| 60/60 [06:38<00:00,  6.85s/it]
100%|██████████| 60/60 [06:41<00:00,  7.01s/it]
100%|██████████| 60/60 [06:35<00:00,  6.64s/it]
100%|██████████| 60/60 [06:39<00:00,  6.78s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 4 for category ecommerce now!


100%|██████████| 60/60 [07:28<00:00,  7.00s/it]
100%|██████████| 60/60 [07:02<00:00,  7.74s/it]
100%|██████████| 60/60 [06:38<00:00,  7.00s/it]
100%|██████████| 60/60 [06:47<00:00,  6.27s/it]
100%|██████████| 60/60 [06:42<00:00,  6.66s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 5 for category blogging now!


100%|██████████| 60/60 [06:50<00:00,  6.44s/it]
100%|██████████| 60/60 [06:35<00:00,  6.32s/it]
100%|██████████| 60/60 [06:38<00:00,  6.92s/it]
100%|██████████| 60/60 [06:41<00:00,  6.58s/it]
100%|██████████| 60/60 [06:30<00:00,  6.10s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 6 for category marketing now!


100%|██████████| 60/60 [07:23<00:00,  7.07s/it]
100%|██████████| 60/60 [07:19<00:00,  6.36s/it]
100%|██████████| 60/60 [06:18<00:00,  6.61s/it]
100%|██████████| 60/60 [06:50<00:00,  7.11s/it]
100%|██████████| 60/60 [06:51<00:00,  6.62s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 7 for category forums now!


100%|██████████| 60/60 [06:44<00:00,  7.03s/it]
100%|██████████| 60/60 [06:36<00:00,  7.10s/it]
100%|██████████| 60/60 [06:32<00:00,  6.29s/it]
100%|██████████| 60/60 [06:46<00:00,  7.05s/it]
100%|██████████| 60/60 [06:52<00:00,  6.79s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 8 for category psd-templates now!


100%|██████████| 60/60 [07:11<00:00,  6.55s/it]
100%|██████████| 60/60 [07:03<00:00,  8.54s/it]
100%|██████████| 60/60 [06:56<00:00,  7.86s/it]
100%|██████████| 60/60 [06:38<00:00,  6.50s/it]
100%|██████████| 60/60 [06:36<00:00,  6.46s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 9 for category muse-templates now!


100%|██████████| 60/60 [06:50<00:00,  6.08s/it]
100%|██████████| 60/60 [06:34<00:00,  6.45s/it]
100%|██████████| 60/60 [06:36<00:00,  6.38s/it]
100%|██████████| 60/60 [06:32<00:00,  6.66s/it]
100%|██████████| 60/60 [06:30<00:00,  6.71s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 10 for category sketch-templates now!


100%|██████████| 60/60 [06:19<00:00,  6.34s/it]
100%|██████████| 60/60 [06:35<00:00,  6.49s/it]
100%|██████████| 60/60 [06:24<00:00,  6.14s/it]
100%|██████████| 60/60 [06:44<00:00,  6.87s/it]
100%|██████████| 60/60 [06:26<00:00,  6.74s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 11 for category typeengine-themes now!


100%|██████████| 60/60 [06:58<00:00,  7.58s/it]
100%|██████████| 60/60 [06:34<00:00,  6.41s/it]
100%|██████████| 60/60 [06:23<00:00,  6.57s/it]
100%|██████████| 60/60 [06:41<00:00,  6.79s/it]
100%|██████████| 60/60 [06:20<00:00,  6.21s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 12 for category static-site-generators now!


100%|██████████| 60/60 [06:34<00:00,  6.16s/it]
100%|██████████| 60/60 [06:37<00:00,  6.09s/it]
100%|██████████| 60/60 [06:48<00:00,  6.77s/it]
100%|██████████| 60/60 [06:50<00:00,  6.96s/it]
100%|██████████| 60/60 [06:37<00:00,  6.36s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Starting iteration 13 for category courses now!


100%|██████████| 60/60 [06:46<00:00,  6.68s/it]
100%|██████████| 60/60 [06:33<00:00,  6.44s/it]
100%|██████████| 60/60 [06:25<00:00,  6.48s/it]
100%|██████████| 60/60 [06:38<00:00,  6.50s/it]
100%|██████████| 60/60 [06:58<00:00,  6.82s/it]


In [477]:
category_url = 'https://themeforest.net/category'
ua = UserAgent()

headers = {
    'User-Agent': ua.random,
    'Content-Type': 'text/html'}

response_cat = requests.get(category_url, headers = headers, timeout = 1)

cookies = response_cat.cookies
soup = bs(response_cat.content, 'lxml')

soup

<!DOCTYPE html>
<!--[if IE 9]> <html class="no-js ie9 fixed-layout" lang="en"> <![endif]--><!--[if gt IE 9]><!--><html class="no-js fixed-layout" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="origin-when-cross-origin" name="referrer"/>
<link href="//d2mdw063ttlqtq.cloudfront.net" rel="dns-prefetch"/>
<link href="//0.s3.envato.com" rel="dns-prefetch"/>
<link href="//thumb-tf.s3.envato.com" rel="dns-prefetch"/>
<link href="//user-profile.s3.envato.com" rel="dns-prefetch"/>
<link href="//image-tf.s3.envato.com" rel="dns-prefetch"/>
<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"fcf8d519de","applicationID":"13909","transactionName":"NTU0DRQNDwshOmITEAI/PRYUG0wOKiwoCA==","queueTime":0,"applicationTime":234,"agent":"","atts":"DXgvW1wZQRQtPChSS1QOMhwLByUINi0+BFNaeCkKCkBZEzY9KFxTBD8rDAMRFzg0OiIEHhU1NltcQAsTMDg+Sl5ZeHZbDxJBXWZ5fURfR29sV1RWU0l1fnRSXVQvKRwUPQIAISY5UktUFzUDDw4P

In [478]:
## Create Main DF

theme_df = pd.DataFrame(columns = ['datetime', 'prod_name', 'sort_type', 'prod_rank', 'prod_url', 'prod_author', 
                                  'prod_cat', 'prod_price', 'prod_rating', 'prod_sales', 'prod_details', 
                                  'main_cat'])
    
## Retrieve categories from base url

first_categories = soup.find_all(attrs = {'id': 'first'})
links = first_categories[0].find_all('a')


## Append links to list
category_url_ls = []
category_name_ls = []
for link in links:
    if link.get('href').split('/')[-2] == 'category':
        category_url_ls.append(link.get('href'))
        category_name_ls.append(link.get('href').replace('/category/',''))
        
category_name_ls

['site-templates',
 'wordpress',
 'cms-themes',
 'ecommerce',
 'blogging',
 'marketing',
 'forums',
 'psd-templates',
 'muse-templates',
 'sketch-templates',
 'typeengine-themes',
 'static-site-generators',
 'courses']

In [481]:
#temp_df = pd.DataFrame({'prod_name': prod_names_ls, 'sort_type': 'sales', 'prod_rank': prod_ranks_ls, 
#                       'prod_url': prod_urls_ls, 'prod_author': prod_authors_ls, 'prod_cat': prod_category_ls,
#                       'prod_price': prod_prices_ls, 'prod_rating': prod_num_ratings_ls,
#                       'prod_sales': prod_num_sales_ls,  'prod_details': prod_details_ls})

os.chdir('/home/valesco/Datasets/business_data/theme_forest_scrapes/')

now = datetime.now().strftime('%Y-%d-%m %H:%M')

with open('theme_forest_scrape_' + now + '.json', 'w') as outfile:
    json.dump(master_ls, outfile)

In [495]:
temp_dict = temp_df.T.to_dict()

pd.DataFrame(master_ls).T

Unnamed: 0,datetime,main_cat,prod_author,prod_cat,prod_details,prod_name,prod_price,prod_rank,prod_rating,prod_sales,prod_url,sort_type
0,2017-18-04 01:33,courses,tutsplus,"[Courses, Web Design]","[Courses /, Web Design, Closed Captions: No,...",Understanding Responsive Images,5,121,,11.0,https://themeforest.net/item/understanding-res...,high_to_low_price
1,2017-18-04 01:33,courses,tutsplus,"[Courses, Web Design]","[Courses /, Web Design, Closed Captions: No,...",Getting to Know Material Design,5,122,7.0,99.0,https://themeforest.net/item/getting-to-know-m...,high_to_low_price
2,2017-18-04 01:33,courses,tutsplus,"[Courses, Web Design]","[Courses /, Web Design, Closed Captions: No,...",Mastering Icon Fonts on the Web,5,123,,8.0,https://themeforest.net/item/mastering-icon-fo...,high_to_low_price
3,2017-18-04 01:33,courses,tutsplus,"[Courses, Code]","[Courses /, Closed Captions: No, Resolution: 1...",Connect the Web With WebSockets,5,124,,7.0,https://themeforest.net/item/connect-the-web-w...,high_to_low_price
4,2017-18-04 01:33,courses,tutsplus,"[Courses, Code]","[Courses /, Closed Captions: No, Resolution: 1...",What's New in Rails 5?,5,125,,2.0,https://themeforest.net/item/whats-new-in-rail...,high_to_low_price
5,2017-18-04 01:33,courses,tutsplus,"[Courses, Web Design]","[Courses /, Web Design, Closed Captions: No,...",Workshop Your Way Through the Web Design Process,5,126,,18.0,https://themeforest.net/item/workshop-your-way...,high_to_low_price
6,2017-18-04 01:33,courses,tutsplus,"[Courses, Code]","[Courses /, Closed Captions: No, Resolution: 1...",Improving C# With Version 6,5,127,,,https://themeforest.net/item/improving-c-with-...,high_to_low_price
7,2017-18-04 01:33,courses,tutsplus,"[Courses, Code]","[Courses /, Closed Captions: No, Resolution: 1...",Deploy Your Rails Application Into Heroku,5,128,,,https://themeforest.net/item/deploy-your-rails...,high_to_low_price
8,2017-18-04 01:33,courses,tutsplus,"[Courses, Web Design]","[Courses /, Web Design, Closed Captions: No,...",Sketch and CSS: Bridging the Gap,2,129,3.0,65.0,https://themeforest.net/item/sketch-and-css-br...,high_to_low_price


In [344]:
prod_details = cat_soup.find_all(attrs = {'class': 'product-list__column-category'})

## retrieve product details
prod_details_ls = []
for prod_detail in prod_details:
    for br, br_tag in enumerate(prod_detail.find('p').getText().split('\n')):
        if len(br_tag) > 30:
            print(br_tag)
            prod_details_ls.append(br_tag.replace('    ', ''))
            
len(prod_details_ls)

0

In [257]:
temp = cat_soup.find('div', attrs = {'class': 'content-l -size-scale-tablet content-right'})
data_view = temp.find_all('div', attrs = {'data-view': 'productList'})
data_view2 = data_view[0].find_all(attrs = {'data-view': 'bookmarkStatesLoader'})
data_view3 = data_view2[0].find_all(attrs = {'class': 'js-google-analytics__list-event-container'})
container = cat_soup.find_all(attrs = {'class': 'product-list__heading'})

In [337]:
for p, prod in enumerate(prod_details):
    print(p, prod.getText())

0 


   in 

    Site Templates /
    Specialty Pages /

  Under Construction


        High Resolution: Yes, Compatible Browsers: IE9, IE10, IE11, Firefox, Safari, Opera, Chrome, Edge, Compatible With: Bootstrap 3.x, Columns: 2
      

1 


   in 

    Site Templates /

  Mobile


        High Resolution: Yes, Compatible Browsers: IE10, IE11, Firefox, Safari, Opera, Chrome, Edge, Columns: 4+
      

2 


   in 

    Site Templates /
    Specialty Pages /

  Under Construction


        High Resolution: Yes, Compatible Browsers: IE9, IE10, IE11, Firefox, Safari, Opera, Chrome, Compatible With: Bootstrap 3.x
      

3 


   in 

    Site Templates /

  Entertainment


        Compatible Browsers: IE7, IE8, IE9, IE10, IE11, Firefox, Safari, Opera, Chrome, Columns: 4+
      

4 


   in 

    Site Templates /

  Creative


        High Resolution: Yes, Compatible Browsers: IE9, IE10, IE11, Firefox, Safari, Opera, Chrome, Edge, Compatible With: Bootstrap 3.x, Columns: 3
      

5 


   in 

In [406]:
ua = UserAgent()
ua.random

'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36'

In [455]:
os.chdir('/home/valesco/Datasets/business_data/theme_forest_scrapes/')
now = datetime.now().strftime('%Y-%d-%m %H:%M')

theme_df.to_csv('theme_forest_scrape_' + now + '.csv', index = False, sep = '|')

In [456]:
category_name_ls[8]

'muse-templates'