In [2]:
import sqlite3
import requests
from bs4 import BeautifulSoup 
from bs4.element import Comment
import random 
import datetime 

In [178]:
#grab visible text on the webpages
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

#find all text using beautiful soup and filter through
def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip()for t in visible_texts)

In [179]:
#store all url's from the text document in a list and opens and reads the text document
sites = []
with open("tennisplayerstitles.txt",'r') as infile :
    for line in infile :
        sites.append(line.strip())

In [None]:
#connects to a database I made and either drops or creates a table after running the command
db = sqlite3.connect("tennis.db")
cur = db.cursor()
cur.execute('''DROP TABLE IF EXISTS player_table''')
cur.execute('''CREATE TABLE player_table (player_url TEXT, player_name TEXT, player_bio TEXT, player_born TEXT)''')

In [None]:
#this is grabbing the url, player name, the bio, and the year they were born from the wikipedia pages and creating columns
for site in sites:
    page = requests.get(site)
    soup = BeautifulSoup(page.text, "html.parser")
    name = text_from_html(str(soup.find_all("h1",attrs={"id":"firstHeading"})))
    text = text_from_html(str(soup.find_all("div", attrs={"id":"bodyContent"})))
    born = text_from_html(str(soup.find_all("span", attrs={"class":"bday"})))
    
    
    new_row = [site, name, text, born]
    cur.execute('''INSERT INTO player_table(player_url, player_name, player_bio, player_born) VALUES(?,?,?,?)''', new_row)
    db.commit()

In [None]:
#closing the database
db.close()

In [1]:
%matplotlib inline

import nltk
from nltk.book import *

import numpy as np
import matplotlib 
import matplotlib.pyplot as plt

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
#open and read the file to analyze the web scraped text, assign it to a list
file_location = "C:\\Users\\Taylor Toepke\\ADA\\applied-data-analytics\\web-scraping\\"
input_file = "tenniswebscrapingproject_TT.txt"
sites = []
with open(input_file,'r',encoding = "Latin-1") as infile :
    next(infile)
    for idx, line in enumerate(infile.readlines()) :
        line = line.strip().split(",")

In [4]:
   # spot 6 has the description
if len(line) >= 7 : # sometimes we don't have descriptions
    sites.extend(line[6].split())
        

In [5]:
#read the file and print some lines
with open(file_location + input_file,'r') as ifile :
    print(ifile.readline(20))


player_url,player_na


In [6]:
#grab the length of the list
len(sites)

60

In [17]:
#frequency distribution on the list
tennis_fd = FreqDist(sites)
tennis_fd.most_common(25)

[('the', 10),
 ('round', 6),
 ('reached', 4),
 ('second', 4),
 ('of', 4),
 ('Open', 4),
 ('but', 2),
 ('French', 2),
 ('and', 2),
 ('US', 2),
 ('.', 2),
 ('Her', 2),
 ('best', 2),
 ('result', 2),
 ('was', 2),
 ('at', 2),
 ('Wimbledon', 2),
 ('where', 2),
 ('she', 2),
 ('fourth', 2)]

In [9]:
#print samples and outcomes of Frequency Distribution 
fd = FreqDist(sites)
fd.most_common(10)
print(fd)

<FreqDist with 20 samples and 60 outcomes>


In [10]:
#cleaning the list and importing stopwords
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords_sp = set(nltk.corpus.stopwords.words("spanish"))

def clean_list(text) :
    ''' takes a list of text and returns a new list with 
        * words cast to lowercase
        * stopwords removed
        * only alphanumeric words
    '''
    text_clean = [w.lower() for w in text if w.isalpha()]
    text_clean = [w for w in text_clean if w not in stopwords]
    text_clean = [w for w in text_clean if w not in stopwords_sp]
    return(text_clean)

In [11]:
descs_clean_gm = clean_list(sites)

In [12]:
#print samples and outcomes
fd_mp = FreqDist(descs_clean_gm)
fd_mp.most_common(20)
print(fd_mp)

<FreqDist with 10 samples and 30 outcomes>


In [15]:
#print most common clean words
fd_gm = FreqDist(descs_clean_gm)
fd_gm.most_common(50)

[('round', 6),
 ('reached', 4),
 ('second', 4),
 ('open', 4),
 ('french', 2),
 ('us', 2),
 ('best', 2),
 ('result', 2),
 ('wimbledon', 2),
 ('fourth', 2)]

In [14]:
print(descs_clean_gm)

['reached', 'second', 'round', 'french', 'open', 'second', 'round', 'us', 'open', 'best', 'result', 'wimbledon', 'reached', 'fourth', 'round', 'reached', 'second', 'round', 'french', 'open', 'second', 'round', 'us', 'open', 'best', 'result', 'wimbledon', 'reached', 'fourth', 'round']
