# Python and Modules

Quick settings and introductions about modules

In [1]:
# % sign here with command means we will display figures inline instead of in a new window
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#sets up pandas table display
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns #sets up styles and gives us more plotting options

In [2]:
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(r'http://files.grouplens.org/datasets/movielens/ml-100k/u.user',sep='|',names=u_cols)
users.tail()

Unnamed: 0,user_id,age,sex,occupation,zip_code
938,939,26,F,student,33319
939,940,32,M,administrator,2215
940,941,20,M,student,97229
941,942,48,F,librarian,78209
942,943,22,M,student,77841


In [3]:
m_cols = ['movie_id','title','release_date','video_release_date','imdb_url']
movies = pd.read_csv(r'http://files.grouplens.org/datasets/movielens/ml-100k/u.item',sep='|',names=m_cols,usecols=range(5))
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [4]:
print(movies.dtypes,'\n',movies.describe())

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object 
           movie_id  video_release_date
count  1682.000000                 0.0
mean    841.500000                 NaN
std     485.695893                 NaN
min       1.000000                 NaN
25%     421.250000                 NaN
50%     841.500000                 NaN
75%    1261.750000                 NaN
max    1682.000000                 NaN


In [5]:
users.occupation.head()
x=set(users.occupation)
x

{'administrator',
 'artist',
 'doctor',
 'educator',
 'engineer',
 'entertainment',
 'executive',
 'healthcare',
 'homemaker',
 'lawyer',
 'librarian',
 'marketing',
 'none',
 'other',
 'programmer',
 'retired',
 'salesman',
 'scientist',
 'student',
 'technician',
 'writer'}

In [6]:
testColumns = ['occupation','sex']
users[testColumns].head()

Unnamed: 0,occupation,sex
0,technician,M
1,other,F
2,writer,M
3,technician,M
4,other,F


In [7]:
print(users.iloc[3])

user_id                4
age                   24
sex                    M
occupation    technician
zip_code           43537
Name: 3, dtype: object


In [8]:
oldusers = users[users.age > 25]
oldusers.count()

user_id       671
age           671
sex           671
occupation    671
zip_code      671
dtype: int64

In [9]:
oldmales = users[(users.age == 40) & (users.sex == 'M')]
oldmales.count()

user_id       14
age           14
sex           14
occupation    14
zip_code      14
dtype: int64

In [10]:
femaleProgrammer = users[(users.sex == 'F') & (users.occupation == 'programmer')]
femaleProgrammer.age.mean()

32.166666666666664

In [11]:
ratings = pd.read_csv(r'http://files.grouplens.org/datasets/movielens/ml-100k/u.data',sep='\t',names=['user_id','movie_id','rating','timestamp'])
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [12]:
grouped_data = ratings.rating.groupby(ratings.movie_id)
average_ratings = grouped_data.mean()
print(average_ratings.head())

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64


In [13]:
maximum_rating = average_ratings.max()
good_movie_ids = average_ratings[average_ratings == maximum_rating].index
print(good_movie_ids)

Int64Index([814, 1122, 1189, 1201, 1293, 1467, 1500, 1536, 1599, 1653], dtype='int64', name='movie_id')


In [14]:
print(movies[movies.movie_id.isin(good_movie_ids)].title)

813                         Great Day in Harlem, A (1994)
1121                       They Made Me a Criminal (1939)
1188                                   Prefontaine (1997)
1200           Marlene Dietrich: Shadow and Light (1996) 
1292                                      Star Kid (1997)
1466                 Saint of Fort Washington, The (1993)
1499                            Santa with Muscles (1996)
1535                                 Aiqing wansui (1994)
1598                        Someone Else's America (1995)
1652    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object


In [15]:
ratingscount = grouped_data.count()
#print(ratingscount)
print(ratingscount[average_ratings == maximum_rating])

movie_id
814     1
1122    1
1189    3
1201    1
1293    3
1467    2
1500    2
1536    1
1599    1
1653    1
Name: rating, dtype: int64


In [16]:
newgrouped_data = ratings.rating.groupby(ratings.user_id)
average_ratings = newgrouped_data.mean()
average_ratings.head()

user_id
1    3.610294
2    3.709677
3    2.796296
4    4.333333
5    2.874286
Name: rating, dtype: float64

In [17]:
new_occupation_data = users.sex.groupby(users['occupation'])
new_occupation_data.count()

occupation
administrator     79
artist            28
doctor             7
educator          95
engineer          67
entertainment     18
executive         32
healthcare        16
homemaker          7
lawyer            12
librarian         51
marketing         26
none               9
other            105
programmer        66
retired           14
salesman          12
scientist         31
student          196
technician        27
writer            45
Name: sex, dtype: int64

In [18]:
male_dominant_occupation = new_occupation_data.apply(lambda f:sum(f =='M')>sum(f=='F'))
print(male_dominant_occupation)

occupation
administrator     True
artist            True
doctor            True
educator          True
engineer          True
entertainment     True
executive         True
healthcare       False
homemaker        False
lawyer            True
librarian        False
marketing         True
none              True
other             True
programmer        True
retired           True
salesman          True
scientist         True
student           True
technician        True
writer            True
Name: sex, dtype: bool


In [19]:
import urllib.request
url = "https://www.crummy.com/software/BeautifulSoup"
source = urllib.request.urlopen(url).read().decode('utf-8')
print('Alice' in source)
print(source.count('Soup'))
position = source.find('alien video games')
print(position)
print(source[position:position + len('alien video games')])

False
42
1639
alien video games


In [20]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(source,'lxml')
#print(soup.prettify())
first_tag = soup.findAll('a')
#first_tag.get('href')
link_list = [i.get('href') for i in first_tag]
link_list

['bs4/download/',
 '#Download',
 'bs4/doc/',
 '#HallOfFame',
 'https://code.launchpad.net/beautifulsoup',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'http://www.candlemarkandgleam.com/shop/constellation-games/',
 'http://constellation.crummy.com/Constellation%20Games%20excerpt.html',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'https://bugs.launchpad.net/beautifulsoup/',
 'http://lxml.de/',
 'http://code.google.com/p/html5lib/',
 'bs4/doc/',
 None,
 'bs4/download/',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'download/3.x/BeautifulSoup-3.2.1.tar.gz',
 None,
 'http://www.nytimes.com/2007/10/25/arts/design/25vide.html',
 'https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py',
 'http://www.harrowell.org.uk/viktormap.html',
 'http://svn.python.org/view/tracker/importer/',
 'http://www2.ljworld.com/',
 'http://www.b-list.org/weblog/2010/nov/02/news-done-broke/',
 '

In [21]:
external_links = []
for l in link_list:
    if l is not None and l[:4] == 'http' and l[-4:] == 'html':
        external_links.append(l)
external_links

['http://constellation.crummy.com/Constellation%20Games%20excerpt.html',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'http://www.nytimes.com/2007/10/25/arts/design/25vide.html',
 'http://www.harrowell.org.uk/viktormap.html']

In [22]:
[j for j in link_list if j is not None and j.startswith('http') ]

['https://code.launchpad.net/beautifulsoup',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'http://www.candlemarkandgleam.com/shop/constellation-games/',
 'http://constellation.crummy.com/Constellation%20Games%20excerpt.html',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'https://bugs.launchpad.net/beautifulsoup/',
 'http://lxml.de/',
 'http://code.google.com/p/html5lib/',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'http://www.nytimes.com/2007/10/25/arts/design/25vide.html',
 'https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py',
 'http://www.harrowell.org.uk/viktormap.html',
 'http://svn.python.org/view/tracker/importer/',
 'http://www2.ljworld.com/',
 'http://www.b-list.org/weblog/2010/nov/02/news-done-broke/',
 'http://esrl.noaa.gov/gsd/fab/',
 'http://laps.noaa.gov/topograbber/',
 'http://groups.google.com/group/beautifulsoup/',
 'https://launchpad.net/beauti

In [29]:
h3 = soup.find("h3")
print(h3.string)

Beautiful Soup 3


In [37]:
hall_tag = soup.find('ul')
tmp = [l for l in hall_tag.children]
test =  ["".join(str(a) for a in sublist) for sublist in tmp]
print('\n'.join(test))



<a href="http://www.nytimes.com/2007/10/25/arts/design/25vide.html">"Movable
 Type"</a>, a work of digital art on display in the lobby of the New
 York Times building, uses Beautiful Soup to scrape news feeds.


Reddit uses Beautiful Soup to <a href="https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py">parse
a page that's been linked to and find a representative image</a>.


Alexander Harrowell uses Beautiful Soup to <a href="http://www.harrowell.org.uk/viktormap.html">track the business
 activities</a> of an arms merchant.


The developers of Python itself used Beautiful Soup to <a href="http://svn.python.org/view/tracker/importer/">migrate the Python
bug tracker from Sourceforge to Roundup</a>.


The <a href="http://www2.ljworld.com/">Lawrence Journal-World</a>
uses Beautiful Soup to <a href="http://www.b-list.org/weblog/2010/nov/02/news-done-broke/">gather
statewide election results</a>.


The <a href="http://esrl.noaa.gov/gsd/fab/">NO

In [23]:
url = 'http://www.indeed.com/jobs?q=data+scientist&l='
source = urllib.request.urlopen(url).read().decode('UTF-8')
bs_tree = BeautifulSoup(source,'lxml')
job_count_string = bs_tree.find(id = 'searchCount').contents[0] #first child node of bs object: id = 'searchCount'
job_count_string = job_count_string.split()[-1] #split by space then take the last piece
print("Search yielded %s hits" % (job_count_string))

Search yielded 23,559 hits


In [24]:
job_count_digits = [int(d) for d in job_count_string if d.isdigit()]
job_count = np.sum([digit*(10**exponent) for digit,exponent in zip(job_count_digits[::-1],range(len(job_count_digits)))])
print(job_count_digits)
print(job_count)

[2, 3, 5, 5, 9]
23559


In [47]:
import time
num_pages = int(np.ceil(job_count/10))
base_url = 'http://www.indeed.com'
job_links = []
for i in range(1):
    if i%10 == 0:
        print(num_pages - i)
    url = 'http://www.indeed.com/jobs?q=data+scientist&start=' + str(i*10)
    html_page = urllib.request.urlopen(url).read()
    bs_tree = BeautifulSoup(html_page,'lxml')
    job_link_area = bs_tree.find(id = 'resultsCol')
    job_postings = job_link_area.find_all('div')
    job_postings = [jp for jp in job_postings if not jp.get("class") is None
                   and ''.join(jp.get('class')) == "rowresult"]
    job_ids = [jp.get('data-jk') for jp in job_postings]
    #print(job_ids)
    for id in job_ids:
        job_links.append(base_url + '/rc/clk?jk=' + id)
    time.sleep(1)
    
    print("number of job found: ",len(job_links))
    print(job_links)

2356
number of job found:  12
['http://www.indeed.com/rc/clk?jk=224dc9bcdde1f855', 'http://www.indeed.com/rc/clk?jk=293a1db47cfb7b74', 'http://www.indeed.com/rc/clk?jk=ff04082985a54f44', 'http://www.indeed.com/rc/clk?jk=5d086556de87706d', 'http://www.indeed.com/rc/clk?jk=0be3100a047d8270', 'http://www.indeed.com/rc/clk?jk=87d4e1b73c5090a5', 'http://www.indeed.com/rc/clk?jk=5002040cb0697713', 'http://www.indeed.com/rc/clk?jk=edc2796ec2610de4', 'http://www.indeed.com/rc/clk?jk=0d63c8d24c385ff6', 'http://www.indeed.com/rc/clk?jk=f6c72882af0f66f3', 'http://www.indeed.com/rc/clk?jk=52f486aa93db568a', 'http://www.indeed.com/rc/clk?jk=2072d942741d5e13']


In [66]:

#import pickle
#with open(r'c:\users\zishi\lab1.pkl','wb') as f:
#    pickle.dump(job_links,f)
#with open(r'c:\users\zishi\lab1.pkl','rb') as f:
#    job_links = pickle.load(f)
#print(job_links)

import json
#with open(r'c:\users\zishi\lab1.json','w') as f:
#    json.dump(job_links,f)
with open(r'c:\users\zishi\lab1.json','r') as f:
    job_links = json.load(f)

In [72]:
a = {'a': 1, 'b':2, 'c':3}
print(a.keys())
print(a.values())
for k,v in zip(a.keys(),a.values()):
    print(k,v)

dict_keys(['a', 'b', 'c'])
dict_values([1, 2, 3])
a 1
b 2
c 3


In [93]:
stuff_i_like = ['burger', 'sushi', 'sweet potato fries', 'BBQ','beer']
found_happy_hours = []
my_happy_hours = []
url = 'https://www.downtownla.com/explore/dining-nightlife/happy-hour-finder'
source = urllib.request.urlopen(url).read()
table = BeautifulSoup(source)

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en"> <![endif]--><!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en"> <![endif]--><!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en"> <![endif]--><!--[if IE 9]>    <html class="no-js ie9 oldie" lang="en"> <![endif]--><!--[if gt IE 9]><!--><html class="no-js ie" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="EB9CEE9F7D7F6AC0DB0E002984943553" name="msvalidate.01"/>
<meta content="sgJDyR8AMYtC7EbNE_xCV0ZIbIaRtkU8lphhthS5F5A" name="google-site-verification"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport"/>
<link href="http://www.downtownla.com/" hreflang="en-us" rel="alternate"/>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
<base href="https://www.downtownla.com/explore/dining-nightlife/happy-hour-finder"/>
<meta content="text



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [90]:
for t in table.find_all('h2', class_='name'):
    text = t.text
    for s in t.findNextSiblings():
        text += '\n' + s.text
    found_happy_hours.append(text)
print('Found %d happy hours' % len(found_happy_hours))

Weekday Happy Hour
Weekday Happy Hour
Noe Bar Happy Hour
Monday Happy Hour
Tuesday - Saturday Happy Hour
Bar & Kitchen 
Big Wangs Sports Bar 
Bonaventure Brewing Company 
Monday through Friday
Weekday Happy Hour
Saturday Happy Hour
Sunday Happy Hour
Great Happy Hour
Taco Tuesdays
Weekday Happy Hour
Weekend Happy Hour
Daily Happy Hour
Weekday Happy Hour
Hoppy Hour
Margarita Hour
Weekday Happy Hour
Daily Happy Hour
Social Hour
Happy Hour
Late Night Happy Hour
Daily Happy Hour
Weekday Happy Hour
Beat The Clock
Beat The Clock - Late Night
Daily Happy Hour
Daily Happy Hour
#NOBSHour 
Happy Hour
Weekday Happy Hour
Weekday Happy Hour
Weekday Happy Hour
After Work Hours
Daily Happy Hour 
Daily Happy Hour 
Hip Hop Happy Hour
Happy Hours 
Happy Hours 
Happy Hours 
Hora Especial
Happy Hour
Happy Hour
Aperitivo Hour
Weekday Happy Hour
Daily Happy Hour
Weekday Happy Hour
Happy Hour
Daily Happy Hour
Found 52 happy hours
