In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from plotly.offline import iplot
import plotly.graph_objs as go
import plotly.plotly as py

import pandas as pd
import time

In [2]:
def get_html(url, scroll_pause=3):
    """Return the fully scrolled HTML source of ``url`` rendered in Chrome.

    The page is opened with a local ``chromedriver.exe`` and scrolled to the
    bottom repeatedly until ``document.body.scrollHeight`` stops growing, so
    that content loaded on scroll ends up in the returned markup.

    Parameters
    ----------
    url : str
        Address of the page to load.
    scroll_pause : int or float, optional
        Seconds to wait after each scroll before measuring the page height
        again (default 3).

    Returns
    -------
    str
        ``driver.page_source`` once the page height no longer changes.
    """
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
    try:
        driver.get(url)

        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Give the page time to append more content before re-measuring.
            time.sleep(scroll_pause)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        return driver.page_source
    finally:
        # Always shut the browser down; otherwise each call (and each failure)
        # leaves a Chrome/chromedriver process running.
        driver.quit()

In [3]:
def get_df_free_app(url):
    """Scrape a Google Play collection page into a DataFrame of apps.

    The page is rendered through ``get_html`` and three parallel lists are
    extracted by CSS class: app titles, developer names and average ratings.

    Parameters
    ----------
    url : str
        Address of the collection page.

    Returns
    -------
    pandas.DataFrame
        Columns ``rating_app`` (float), ``name_app`` (str), ``owner_app`` (str).

    Raises
    ------
    ValueError
        If the numbers of titles, developers and ratings found on the page
        differ, since the columns could then no longer be matched row by row.
    """
    rating_label = "Средняя оценка:"
    rating_end = "из"

    html = get_html(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, "html.parser")

    name_html = soup.find_all("div", {"class": "details"})
    rating_html = soup.find_all("div", {"class": "tiny-star star-rating-non-editable-container"})
    owner_html = soup.find_all("div", {"class": "subtitle-container"})

    name_free_app = []
    for item in name_html:
        title = item.find("a", {"class": "title"}).text.strip()
        # Drop everything up to the first space (presumably a rank prefix such
        # as "1." -- confirm against the page). A title without any space is
        # kept whole; str.find() would return -1 there and slicing with it
        # would keep only the last character.
        _, sep, rest = title.partition(" ")
        name_free_app.append(rest.strip() if sep else title)

    owner_free_app = [
        item.find("a", {"class": "subtitle"}).text for item in owner_html
    ]

    rating_free_app = []
    for item in rating_html:
        markup = str(item)
        # str.find() returns -1 (which is truthy) when the text is absent, so
        # it must be compared explicitly rather than used as a condition.
        label_at = markup.find(rating_label)
        if label_at == -1:
            continue
        start = label_at + len(rating_label)
        # Look for the end marker only after the label.
        finish = markup.find(rating_end, start)
        if finish == -1:
            continue
        rating_free_app.append(float(markup[start:finish].strip().replace(",", ".")))

    if not (len(rating_free_app) == len(name_free_app) == len(owner_free_app)):
        raise ValueError(
            "Scraped columns have different lengths: "
            "{} ratings, {} names, {} owners".format(
                len(rating_free_app), len(name_free_app), len(owner_free_app)
            )
        )

    dct = {
        "rating_app": rating_free_app,
        "name_app": name_free_app,
        "owner_app": owner_free_app,
    }

    return pd.DataFrame(data=dct)

In [4]:
def get_df_paid_app(url):
    """Scrape a Google Play collection page into a DataFrame of paid apps.

    The page is rendered through ``get_html`` and four parallel lists are
    extracted by CSS class: app titles, developer names, average ratings and
    prices (text ending in "грн", parsed to float).

    Parameters
    ----------
    url : str
        Address of the collection page.

    Returns
    -------
    pandas.DataFrame
        Columns ``rating_app`` (float), ``price_app`` (float),
        ``name_app`` (str), ``owner_app`` (str).

    Raises
    ------
    ValueError
        If the numbers of titles, developers, ratings and prices found on the
        page differ, since the columns could then no longer be matched row by
        row; also if a price text cannot be converted to a number.
    """
    rating_label = "Средняя оценка:"
    rating_end = "из"
    currency = "грн"

    html = get_html(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, "html.parser")

    name_html = soup.find_all("div", {"class": "details"})
    rating_html = soup.find_all("div", {"class": "tiny-star star-rating-non-editable-container"})
    owner_html = soup.find_all("div", {"class": "subtitle-container"})
    price_html = soup.find_all("button", {"class": "price buy id-track-click id-track-impression"})

    name_app = []
    for item in name_html:
        title = item.find("a", {"class": "title"}).text.strip()
        # Drop everything up to the first space (presumably a rank prefix such
        # as "1." -- confirm against the page). A title without any space is
        # kept whole; str.find() would return -1 there and slicing with it
        # would keep only the last character.
        _, sep, rest = title.partition(" ")
        name_app.append(rest.strip() if sep else title)

    owner_app = [
        item.find("a", {"class": "subtitle"}).text for item in owner_html
    ]

    rating_app = []
    for item in rating_html:
        markup = str(item)
        # str.find() returns -1 (which is truthy) when the text is absent, so
        # it must be compared explicitly rather than used as a condition.
        label_at = markup.find(rating_label)
        if label_at == -1:
            continue
        start = label_at + len(rating_label)
        # Look for the end marker only after the label.
        finish = markup.find(rating_end, start)
        if finish == -1:
            continue
        rating_app.append(float(markup[start:finish].strip().replace(",", ".")))

    price_app = []
    # Only every other matched button is kept (even positions), as in the
    # original filtering -- presumably each price is present twice in the
    # markup; confirm against the page.
    for item in price_html[::2]:
        text = item.find("span", {"class": "display-price"}).text.strip()
        end = text.find(currency)
        if end != -1:
            # Cut the currency suffix only when it is really there; slicing
            # with -1 would silently drop the last digit instead.
            text = text[:end]
        # Remove regular and non-breaking spaces (e.g. thousands separators)
        # and switch the decimal comma to a point before converting.
        text = text.replace("\xa0", "").replace(" ", "").replace(",", ".")
        price_app.append(float(text))

    if not (len(rating_app) == len(price_app) == len(name_app) == len(owner_app)):
        raise ValueError(
            "Scraped columns have different lengths: "
            "{} ratings, {} prices, {} names, {} owners".format(
                len(rating_app), len(price_app), len(name_app), len(owner_app)
            )
        )

    dct = {
        "rating_app": rating_app,
        "price_app": price_app,
        "name_app": name_app,
        "owner_app": owner_app,
    }

    return pd.DataFrame(data=dct)

In [5]:
# Collection pages to scrape: top-selling free and top-selling paid apps.
url_free_app, url_paid_app = (
    "https://play.google.com/store/apps/collection/topselling_free",
    "https://play.google.com/store/apps/collection/topselling_paid",
)

# Each call drives a Chrome session through get_html, so this cell is slow.
free_app = get_df_free_app(url=url_free_app)
paid_app = get_df_paid_app(url=url_paid_app)

In [6]:
# Preview the first five rows of the scraped free-apps table.
free_app.head(n=5)

Unnamed: 0,rating_app,name_app,owner_app
0,4.8,Мой Говорящий Том 2,Outfit7 Limited
1,4.4,Fire Balls 3D,VOODOO
2,4.6,LIKE - Самое популярное видео-сообщество,BIGO TECHNOLOGY PTE. LTD.
3,4.3,Paper.io 2,VOODOO
4,4.7,Joom,Joom


In [7]:
# Preview the first five rows of the scraped paid-apps table.
paid_app.head(n=5)

Unnamed: 0,rating_app,price_app,name_app,owner_app
0,4.5,123.5,Майнкрафт,Mojang
1,4.0,6.99,The Sun: Origin,AGaming+
2,3.8,31.99,Assassin’s Creed Идентификация,Ubisoft Entertainment
3,4.5,25.99,Poweramp - разблокировка,Max MP
4,4.5,41.37,Torque Pro (OBD2 / автомобиль),Ian Hawkins


In [8]:
# Number of free apps per distinct rating value.
count_rating_free_df = (
    free_app
    .groupby("rating_app", as_index=False)["name_app"]
    .count()
)

# One bar per rating; the bar height is the count of apps with that rating.
trace = go.Bar(
    x=count_rating_free_df["rating_app"],
    y=count_rating_free_df["name_app"],
)
layout = go.Layout(title="Рейтинг бесплатных топ приложений в Google Play Украина")

fig = go.Figure(data=[trace], layout=layout)
# NOTE(review): `py` is plotly.plotly, while the `iplot` imported from
# plotly.offline at the top of the notebook is unused -- confirm which
# rendering mode is intended.
py.iplot(fig)

In [9]:
# Number of paid apps per distinct rating value.
count_rating_paid_df = (
    paid_app
    .groupby("rating_app", as_index=False)["name_app"]
    .count()
)

# One bar per rating; the bar height is the count of apps with that rating.
trace = go.Bar(
    x=count_rating_paid_df["rating_app"],
    y=count_rating_paid_df["name_app"],
)
layout = go.Layout(title="Рейтинг платных приложений в Google Play Украина")

fig = go.Figure(data=[trace], layout=layout)
# NOTE(review): `py` is plotly.plotly, while the `iplot` imported from
# plotly.offline at the top of the notebook is unused -- confirm which
# rendering mode is intended.
py.iplot(fig)