In [1]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import shutil, os, time, datetime, re
import pandas as pd
from PIL import Image
from scraper import DataPreparation, Corpus
from analyzer import Analyzer

# TODO

- carousel finder
- select some features from builtwith

# WORKFLOW

1. Create Corpus with `Corpus.initCorpus()` (once initialized, you do not have to initialize it again).
2. Prepare the website data in the corpus (via `DataPreparation()` class).

    1. Download and analyze images via `DataPreparation.getImages()`.
    2. Once you have downloaded the data for the current corpus, you do not need to download and analyze the images again. You can no access them via `DataPreparation.getImagesFromPickle()`.
    3. Analyze the infrastructure of the websites with `DataPreparation.getBuiltWith()`. Once you have done this, you can access the data via `DataPreparation.getBuiltWithFromPickle()`.
    4. Create and save a dictionary with all the results (except those from `DataPreparation.getBuiltWith()`) via `DataPreparation.createAnalyzerDict()`.

3. The clustering of the data dictionary and the analysis of the `builtwith` data are conducted with the `Analyzer` class.
    1.To cluster the data in the dictionary created with `DataPreparation.createAnalyzerDict()`, the user can choose the columns that should be selected with the help of the method `Analyzer.setColumnSelection()`. The data can then be clustered with the method `Analyzer.clusterDataKMeans()`. However, it is advisable to standardize the data first with the method `Analyzer.standardizeData()`.
    2. To figure out which n works best in the `Analyzer.clusterDataKMeans()` method, the user can use `Analyzer.createElbowPlot()`.
    3. The results of the clustering can be visualized with `Analyzer.visualizeCluster()`-
    4. To analyze the `builtwith` data, the user first needs to execute `Analyzer.getBuiltWithCategorical()`.
    5. Then, the data can be anaylzed using cosine with `Analyzer.getCosine4CategoricalData()`.

# Corpus Creation

In [None]:
cp = Corpus()

In [None]:
cp.initCorpus()

# Data Preparation

In [3]:
dp = DataPreparation()

In [2]:
#dp.getImages()

In [None]:
#bw_dict = dp.getBuiltWith()

In [None]:
#bw_dict = dp.getBuiltWithFromPickle()

In [None]:
#bw_dict

In [4]:
data_dict = dp.createAnalyzerDict()

<DirEntry 'INFO.txt'>
<DirEntry 'www.nytimes.com.html'>
Current netloc nytimes.com of type <class 'str'>


# Analysis

In [5]:
ana = Analyzer()

In [None]:
# Categorical Data from builtwith

In [None]:
bw_data = ana.getBuiltWithCategorical()

In [None]:
bw_clustered = ana.getCosine4CategoricalData(bw_data)

In [None]:
bw_data

In [None]:
bw_clustered

In [None]:
# KMeans clustering and standardization

In [None]:
ana.setColumnSelection(['big_images', "small_images", "total_length"])

In [6]:
ana.getColumnSelection()

Currently, the following columns are selected: ['total_images', 'big_images', 'middle_images', 'small_images', 'background_images', 'total_length', 'external_links', 'internal_links', 'total_links']


In [7]:
ana.data_df

Unnamed: 0,total_images,big_images,middle_images,small_images,background_images,total_length,external_links,internal_links,total_links
nytimes.com,47,17,17,7,6,7229,5,128,133


In [None]:
standardized_data = ana.standardizeData(ana.data_df)

In [None]:
clustered_data = ana.clusterDataKMeans(standardized_data, n=5)

In [None]:
ana.createElbowPlot(standardized_data)

In [None]:
#ana.visualizeCluster(clustered_data, "total_length", "total_images")

In [None]:
clustered_data

In [None]:
normal_df = ana.data_df[["big_images", "small_images", "total_length"]]

In [None]:
normal_df["clusters"] = clustered_data["clusters"]

In [None]:
normal_df

In [None]:
ana.visualizeCluster(normal_df, "total_length", "total_images")

# Testing stuff

In [None]:
#try and get live image sizes

In [59]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC

In [35]:
wbdriver = webdriver.Edge("webdriver/msedgedriver.exe")

In [60]:
page = wbdriver.get("https://www.nytimes.com")

In [37]:
wbdriver.maximize_window()

In [74]:
imgs = wbdriver.find_elements_by_tag_name("img")

In [76]:
true_counter = 0
for img in imgs:
    print(img.get_attribute("src"))
    print(img.size)
    print(img.is_displayed())
    if img.is_displayed():
        true_counter += 1

https://static01.nyt.com/images/2017/01/29/podcasts/the-daily-album-art/the-daily-album-art-square320-v4.png
{'height': 45, 'width': 45}
True
https://static01.nyt.com/images/2020/09/03/podcasts/sway-album-art/sway-album-art-square320-v10.jpg
{'height': 45, 'width': 45}
True
https://static01.nyt.com/images/2020/12/10/multimedia/tips-bb-promo-image/tips-bb-promo-1607639510132-thumbLarge.png
{'height': 45, 'width': 45}
True
https://static01.nyt.com/images/2021/01/05/us/05GEORGIA-hp-slide-BW5U/05GEORGIA-hp-slide-BW5U-videoSixteenByNine1050-v2.jpg
{'height': 329, 'width': 585}
False
https://static01.nyt.com/images/2021/01/05/us/05GEORGIA-hp-slide-JS40/05GEORGIA-hp-slide-JS40-videoSixteenByNine1050-v2.jpg
{'height': 329, 'width': 585}
True
https://static01.nyt.com/images/2021/01/05/us/05GEORGIA-hp-slide-9XVV/05GEORGIA-hp-slide-9XVV-videoSixteenByNine1050-v2.jpg
{'height': 329, 'width': 585}
False
https://static01.nyt.com/images/2021/01/05/us/05GEORGIA-hp-slide-1RFN/05GEORGIA-hp-slide-1RFN-vi

{'height': 257, 'width': 385}
False
https://static01.nyt.com/images/2020/12/14/nyregion/00nyvirus-publicspace4-copy/00nyvirus-publicspace4-copy-square640.jpg
{'height': 0, 'width': 0}
False
https://static01.nyt.com/images/2021/01/06/dining/06Cookbooks/06Cookbooks-square640.jpg?quality=75&auto=webp&disable=upscale&width=350
{'height': 178, 'width': 178}
True
https://static01.nyt.com/images/2021/01/10/books/review/10Miller/10Miller-square640.jpg?quality=75&auto=webp&disable=upscale&width=350
{'height': 178, 'width': 178}
True
https://static01.nyt.com/images/2021/01/05/obituaries/05roberts1-sub/merlin_181960011_cb2cec62-ea41-4900-8762-1eb9e4fffe64-threeByTwoMediumAt2X.jpg?quality=75&auto=webp&disable=upscale&width=1100
{'height': 307, 'width': 461}
True
https://static01.nyt.com/images/2020/12/30/business/00oligarchdivorce01/00oligarchdivorce01-threeByTwoSmallAt2X-v2.jpg?quality=75&auto=webp&disable=upscale&width=400
{'height': 144, 'width': 216}
True
https://static01.nyt.com/images/2020/1

{'height': 144, 'width': 215}
True
https://static01.nyt.com/images/2021/01/03/opinion/03kristof_SR_print/02kristof-02-threeByTwoSmallAt2X.jpg?quality=75&auto=webp&disable=upscale&width=400
{'height': 144, 'width': 215}
True
https://static01.nyt.com/images/2021/01/04/arts/04bridgerton-race1/merlin_180244809_82916376-4d0b-46c0-8f78-e5affff877d9-threeByTwoSmallAt2X.jpg?quality=75&auto=webp&disable=upscale&width=400
{'height': 144, 'width': 215}
True
https://static01.nyt.com/images/2020/12/30/arts/30moynihan-art-11/30moynihan-art-11-threeByTwoSmallAt2X.jpg?quality=75&auto=webp&disable=upscale&width=400
{'height': 144, 'width': 215}
True
https://static01.nyt.com/images/2021/01/03/arts/03fincher-notebook-bts/03fincher-notebook-bts-threeByTwoSmallAt2X-v2.jpg?quality=75&auto=webp&disable=upscale&width=400
{'height': 144, 'width': 215}
True
https://static01.nyt.com/images/2021/01/04/arts/04spiral1/04spiral1-threeByTwoSmallAt2X-v2.jpg?quality=75&auto=webp&disable=upscale&width=400
{'height': 144

In [63]:
bck = wbdriver.find_elements_by_xpath("//*[contains(@style,'background-image')]")

In [68]:
for bb in bck:
    print(bb.value_of_css_property("background-image"))

url("https://static01.nyt.com/newsgraphics/2020/03/16/coronavirus-maps/5990fca4458a05ec9a94c765b40d83fe0b2485fe/build/curve-grid/cases/total/USA.svg")
url("https://static01.nyt.com/newsgraphics/2020/03/16/coronavirus-maps/5990fca4458a05ec9a94c765b40d83fe0b2485fe/build/curve-grid/cases/total/IND.svg")
url("https://static01.nyt.com/newsgraphics/2020/03/16/coronavirus-maps/5990fca4458a05ec9a94c765b40d83fe0b2485fe/build/curve-grid/cases/total/BRA.svg")
url("https://static01.nyt.com/newsgraphics/2020/03/16/coronavirus-maps/5990fca4458a05ec9a94c765b40d83fe0b2485fe/build/curve-grid/cases/total/GBR.svg")
url("https://static01.nyt.com/newsgraphics/2020/03/16/coronavirus-maps/5990fca4458a05ec9a94c765b40d83fe0b2485fe/build/curve-grid/cases/total/MEX.svg")
url("https://static01.nyt.com/newsgraphics/2020/03/16/coronavirus-maps/5990fca4458a05ec9a94c765b40d83fe0b2485fe/build/curve-grid/cases/total/CAN.svg")


In [72]:
elem = wbdriver.find_element_by_text("Subscribe now")

AttributeError: 'WebDriver' object has no attribute 'find_element_by_text'

In [77]:
true_counter

78

In [78]:
len(imgs)

118