# Table of Contents
* [1. Web Scraping](#1.-Web-Scraping)
* [2. Filbert checking data](#2.-Filbert-checking-data)
* [3. Filbert: run from here onwards](#3.-Filbert:-run-from-here-onwards)

In [2]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import random

In [3]:
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

# 1. Web Scraping

In [34]:
#PARAMETERS
# First and last release year of the IMDb search; both values are inserted
# into the "year=" query parameter as "YEAR_START,YEAR_END".
YEAR_START = 2014
YEAR_END = 2014
# Value sent as the "title_type" query parameter of the search.
TYPE = "feature"

In [35]:
# Download the first page of the IMDb title search for the configured TYPE
# and YEAR_START..YEAR_END range using requests. The response is parsed in
# the next cell to find out how many results exist in total.
page=requests.get("http://www.imdb.com/search/title?at=0&sort=alpha&title_type="+TYPE+"&year="+str(YEAR_START)+","+str(YEAR_END))

In [36]:
# Work out how many titles the search returned for the selected year range.
soup = BeautifulSoup(page.text, "html.parser")
# The text of the "left" div looks like "\n1-50 of 8,476\ntitles.\n".
NumResults = soup.find("div", attrs={"class": "leftright"}).find_all("div", attrs={"id": "left"})[0].get_text()
# Keep the third whitespace-separated token ("8,476") and strip the
# thousands separators so it can be converted to the int 8476.
NumResults = int(NumResults.split()[2].replace(",", ""))
print(NumResults)

8997


In [37]:
# Walk through the paginated search results (the start offset advances by 50
# per request) and collect the href of every title link.
titles = []
for i in xrange(1,NumResults,50):
    page=requests.get("http://www.imdb.com/search/title?at=0&sort=alpha&start="+str(i)+"&title_type="+TYPE+"&year="+str(YEAR_START)+","+str(YEAR_END))
    soup = BeautifulSoup(page.text, "html.parser")
    # Skip the first <tr> of the results table (presumably the header row).
    rows = soup.find("table", attrs={"class": "results"}).find_all("tr")[1:]
    # extend() grows the list in place instead of copying it for every page.
    titles.extend(row.find("td", attrs={"class":"title"}).find("a").get("href") for row in rows)

print(len(titles))

8997


In [38]:
# Cache filled by get_page(): maps each visited url to the page text on
# success, to 1 when the response status was not 200, or to 2 when the
# request raised an exception.
urlcache={} #a dict to store the visited urls

In [39]:
def get_page(url):
    """Return the IMDb page for *url*, fetching and caching it when needed.

    Args:
        url: Path that is appended to "http://www.imdb.com/" to build the
            request URL (e.g. the href of a title link).

    Returns:
        The page text when the request succeeded with status 200, the
        integer 1 when the server answered with another status, or the
        integer 2 when the request raised an exception. The same value is
        stored in ``urlcache[url]``; entries holding 1 or 2 are retried on
        the next call.
    """
    # Fetch when the URL was never visited or the previous attempt failed.
    if (url not in urlcache) or (urlcache[url]==1) or (urlcache[url]==2):
        # Pause before every request so the site is not hammered.
        time.sleep(1)
        # We don't know if the page really exists, or even if it does,
        # whether we'll be able to reach it.
        try:
            r = requests.get("http://www.imdb.com/%s" % url)

            if r.status_code == 200:
                urlcache[url] = r.text
            else:
                urlcache[url] = 1
        # Catch Exception rather than using a bare except, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long crawl instead
        # of being recorded as a failed download.
        except Exception:
            urlcache[url] = 2
    return urlcache[url]

In [40]:
# Download every title page into urlcache (get_page sleeps one second before
# each request it actually sends) and print the elapsed wall-clock seconds.
start = time.time()
for title in titles:
    get_page(title)
print time.time() - start

14735.039


In [41]:
# Number of cache entries that still hold a failure marker (1 or 2);
# 0 means every download succeeded.
print np.sum([(urlcache[k]==1) or (urlcache[k]==2) for k in urlcache])# no one or two's
# True when the cache has as many entries as there are collected titles
# (this only holds if titles contains no duplicate urls).
print len(titles)==len(urlcache)#we got all of the urls

0
True


In [45]:
def movie_info(url,page_text):
    """Parse one IMDb title page into a flat dict of movie attributes.

    Args:
        url: Relative url of the title page (e.g. '/title/tt1951264/'). It
            is stored under 'url' and used to build the url of the filming
            locations page.
        page_text: HTML text of the title page.

    Returns:
        A dict that always contains 'url'. The other keys are only present
        when the matching markup is found: 'location_page',
        'contentRating', 'duration', 'genre', 'release_dates', 'name',
        'year', 'user_ratings', 'user_ratings_count', 'critic_ratings',
        'country', 'language', 'budget', 'opening_weekend' and 'gross'.

        'location_page' is the string "None" when the "Filming Locations"
        link carries the "link ghost" class, the string "fail" when the
        locations page could not be downloaded or parsed, and otherwise a
        list of location strings.

    Note:
        Downloads the locations page (and sleeps one second afterwards)
        when the title has a "Filming Locations" link.
    """
    info = {}
    info['url'] = url
    soup = BeautifulSoup(page_text, "html.parser")

    box = soup.find("div", attrs={"id": "full_subnav"})
    if box:
        for row in box.find_all("li"):
            link = row.find("a")
            # A list item without a link cannot be the locations entry;
            # skipping it avoids calling get_text() on None.
            if link is None or link.get_text() != "Filming Locations":
                continue
            if row.find("a", attrs={"class": "link ghost"}):
                info['location_page'] = "None"
                continue
            loc_url = url+'locations?ref_=tt_ql_dt_6'
            try:
                r = requests.get("http://www.imdb.com/%s" % loc_url)
                time.sleep(1)
                if r.status_code == 200:
                    soup2 = BeautifulSoup(r.text,"html.parser")
                    locations = soup2.find_all("div", attrs={"class": "soda"})
                    info['location_page'] = [
                        location.find("a").get_text().replace("\n", " ")
                        for location in locations
                    ]
                else:
                    info['location_page'] = "fail"
            # Best effort: any download or parsing problem is recorded as
            # "fail", but KeyboardInterrupt/SystemExit are not swallowed.
            except Exception:
                info['location_page'] = "fail"

    topbar = soup.find("table", attrs={"id": "title-overview-widget-layout"})

    # Everything below reads from the overview table, so it is all skipped
    # when the page does not contain one (previously this raised
    # AttributeError on None).
    if topbar:
        #get information from the first info box
        infobar = topbar.find("div", attrs={"class": "infobar"})
        if infobar:
            rating_tag = infobar.find("meta", attrs={"itemprop": "contentRating"})
            if rating_tag:
                # Only the first word of the rating text is kept; an empty
                # text leaves the key unset instead of raising IndexError.
                rating_words = rating_tag.get_text().split()
                if rating_words:
                    info['contentRating'] = rating_words[0]
            duration_tag = infobar.find("time", attrs={"itemprop": "duration"})
            if duration_tag:
                info['duration'] = duration_tag.get_text().strip()
            info['genre'] = [
                genre.get_text()
                for genre in infobar.find_all("span", attrs={"itemprop": "genre"})
            ]
            info['release_dates'] = [
                dates.get_text().replace("\n", " ")
                for dates in infobar.find_all("a", attrs={"title": "See all release dates"})
            ]

        #get name of movie
        overview = topbar.find("td", attrs={"id": "overview-top"})
        heading = overview.find("h1") if overview is not None else None
        if heading is not None:
            name_tag = heading.find("span", attrs={"itemprop":"name"})
            if name_tag:
                info['name'] = name_tag.get_text()
            year_tag = heading.find("span", attrs={"class":"nobr"})
            if year_tag:
                # Drop the first and last character of the text
                # (presumably the parentheses around the year).
                info['year'] = year_tag.get_text()[1:-1]

        starbox = topbar.find("div", attrs={"class": "star-box-details"})
        if starbox:
            rating_value = starbox.find("span", attrs={"itemprop": "ratingValue"})
            if rating_value:
                info['user_ratings'] = rating_value.get_text()
            rating_count = starbox.find("span", attrs={"itemprop": "ratingCount"})
            if rating_count:
                info['user_ratings_count'] = rating_count.get_text()
            critic_link = starbox.find("a", attrs={"href": "criticreviews?ref_=tt_ov_rt"})
            if critic_link:
                info['critic_ratings'] = critic_link.get_text().strip()

    # Details section: the <h4> label of each text block selects the field.
    # Fields read from the first link inside the block:
    link_fields = {"Country:": 'country', "Language:": 'language'}
    # Fields read from the n-th whitespace-separated word of the block text:
    word_fields = {
        "Budget:": ('budget', 1),
        "Opening Weekend:": ('opening_weekend', 2),
        "Gross:": ('gross', 1),
    }
    detailsbox = soup.find("div", attrs={"id": "titleDetails"})
    if detailsbox:
        for block in detailsbox.find_all("div", attrs={"class": "txt-block"}):
            label_tag = block.find("h4")
            if not label_tag:
                continue
            label = label_tag.get_text()
            if label in link_fields:
                value_link = block.find("a")
                if value_link is not None:
                    info[link_fields[label]] = value_link.get_text()
            elif label in word_fields:
                key, position = word_fields[label]
                words = block.get_text().split()
                # Leave the key unset when the block has too few words.
                if len(words) > position:
                    info[key] = words[position]

    return info

#Testing Code
#k = '/title/tt1951264/'
#r = requests.get("http://www.imdb.com"+k)
#v = r.text

#movie_info(k, v)

#Testing Code
#k = '/title/tt1951264/'
#r = requests.get("http://www.imdb.com"+k)
#v = r.text

#movie_info(k, v)

In [46]:
# Parse every cached title page (url -> page text) into its info dict.
movie_info_list = [movie_info(k, v) for k, v in urlcache.items()]

In [9]:
#movie_info_list

In [48]:
# Persist the scraped records. The with-statement closes the file even when
# json.dump raises, so a failed dump does not leak the handle.
with open("tempdata/movieinfo.json","w") as fd:
    json.dump(movie_info_list, fd)

In [12]:
#Import Json
# Reload the scraped movie records from tempdata/movieinfo.json. The list is
# first reset to empty and then replaced by the parsed file content.
movie_info_list=[]
with open("tempdata/movieinfo.json", "r") as fd:
    movie_info_list = json.load(fd)

In [13]:
df = pd.DataFrame(movie_info_list)

In [14]:
print len(movie_info_list)

0


# 2. Filbert checking data

In [15]:
# Load the per-year scrape results and stack them into one frame.
# A single pd.concat replaces the chain of DataFrame.append calls, which
# recopied the accumulated frame on every call and no longer exists in
# pandas 2.0. As before, each frame keeps its own index values.
movie_files = [
    "tempdata/movieinfo_2014_full.json",
    "tempdata/movieinfo_2013.json",
    "tempdata/movieinfo_2012.json",
    "tempdata/movieinfo_2011.json",
    "tempdata/movieinfo_2010.json",
    "tempdata/movieinfo_2009.json",
]
df = pd.concat([pd.read_json(movie_file) for movie_file in movie_files])

In [16]:
r_df = df[~df["user_ratings"].isnull()].reset_index()

In [17]:
len(r_df)

32788

In [18]:
# Keep only rated movies whose location_page is not the string "None" (the
# marker movie_info stores when the "Filming Locations" link is greyed out).
# NOTE(review): rows holding the "fail" marker, or no location_page at all,
# are not equal to "None" and therefore pass this filter - confirm that is
# intended, since later cells iterate over location_page as a list.
# reset_index() keeps the previous index as a column, which is where the
# extra 'level_0' column in the output comes from.
new_df = r_df[r_df["location_page"]!="None"].reset_index()
new_df

Unnamed: 0,level_0,index,budget,contentRating,country,critic_ratings,duration,genre,gross,language,location_page,name,opening_weekend,release_dates,url,user_ratings,user_ratings_count,year
0,0,0,"€1,500,000",,Kazakhstan,,110 min,[Drama],,Russian,"[Minsk, Belarus , Almaty, Kazakhstan , St. Pet...",Ya ne vernus,,[ 1 March 2014 (Russia) ],/title/tt2637844/,6.9,320,2014
1,2,3,,,USA,,85 min,"[Horror, Mystery, Thriller]",,English,"[Silt, Colorado, USA ]",Find Me,,[ 1 September 2014 (USA) ],/title/tt3027188/,4.5,649,2014
2,3,4,,,Ireland,,88 min,"[Comedy, Drama, Family]",,English,"[Dublin, County Dublin, Ireland , County Wickl...",Gold,,[ 10 October 2014 (Ireland) ],/title/tt3134422/,6.1,406,2014
3,7,10,,Not,USA,,85 min,"[Action, Sci-Fi, Thriller]",,English,"[Los Angeles, California, USA , Long Beach, Ca...",Mega Shark vs. Mecha Shark,,[ 28 January 2014 (USA) ],/title/tt3152098/,2.6,1988,2014
4,8,11,,,Poland,,117 min,"[Drama, Romance]",,Polish,"[Ancona, Marche, Italy , Warsaw, Mazowieckie, ...",Obce cialo,,[ 5 December 2014 (Poland) ],/title/tt3997248/,4.6,156,2014
5,12,19,"$50,000",,USA,,,[Comedy],,English,"[Angelus Oaks, California, USA , California, U...",Camp-Off,,[ 4 April 2014 (USA) ],/title/tt3482042/,7.3,7,2014
6,14,21,,,Germany,,84 min,"[Comedy, Drama]",,German,"[Akademie der Künste, Berlin, Germany , Haus d...",Ich will mich nicht künstlich aufregen,,[ February 2014 (Germany) ],/title/tt3471498/,6.2,28,2014
7,15,23,,,Sweden,,99 min,"[Action, Crime, Drama]",,Swedish,"[Falkenberg, Hallands län, Sweden , Uddevalla,...",Svart kung,,[ 15 March 2014 (Sweden) ],/title/tt2935416/,6.2,42,2014
8,16,24,"$7,100",,USA,,92 min,[Action],,English,"[New York, USA , Paulsboro, New Jersey, USA , ...",Battle,,[ 25 December 2014 (USA) ],/title/tt2094769/,6.6,8,2014
9,20,29,,,UK,,90 min,[Horror],,English,"[Wales, UK ]",Valley of the Witch,,[ 13 January 2015 (USA) ],/title/tt2908340/,5.0,67,2014


In [28]:
len(new_df)

18031

In [49]:
# Break every filming location ("City, Region, Country") into its
# comma-separated parts, normalised to stripped, lower-case utf-8 strings.
#   places           - set of all distinct parts
#   places_freq      - number of occurrences of each part
#   separated_places - per movie, the list of its parts (in order)
places = set()
places_freq = {}
separated_places = []
for row in new_df["location_page"]:
    temp = []
    for loc in row:
        for item in loc.split(","):
            stripped = item.encode('utf8').strip().lower()
            if stripped!="":
                places.add(stripped)
                temp.append(stripped)
                # Count every occurrence exactly once. The previous code
                # initialised a new key to 1 and then incremented it again,
                # so all counts were one too high; it also scanned
                # places_freq.keys() linearly for each item.
                places_freq[stripped] = places_freq.get(stripped, 0) + 1
    separated_places.append(temp)
df_boolean = new_df.copy()
df_boolean['separated_places'] = separated_places

In [28]:
places

{'esher',
 "hell's kitchen",
 'rhineland-palatinate',
 'glamis',
 'value drugs',
 'parc de sceaux',
 'mgm grand hotel - 3799 las vegas boulevard south',
 'tilton',
 'saylorsburg',
 'stonehurst',
 'uithoorn',
 'sant just desvern',
 'mazari sharif',
 'roslyn',
 'khanua',
 'cyprus',
 'la garriga',
 'zittau',
 'laiya-aplaya',
 'luj\xc3\xa1n',
 'lillestr\xc3\xb8m',
 'brownstone street',
 'chishui',
 'kbs suwon studio',
 'thines',
 'isleton',
 'cropvale farm',
 'str\xc3\xb6mstad',
 'colorno',
 '1721 west olive ave',
 "station centrale d'autobus",
 'conejo valley',
 'keighley railway station',
 'hermann',
 'm\xc3\xa9rignac',
 'wayland',
 'saipan',
 'belvoir',
 'kannapolis',
 'puente de ixtla',
 'utrera',
 'saskatchewan',
 'manhatan',
 'grand hotel excelsior',
 'bratislava',
 'cafe formaggio',
 'the concert hall',
 'gulmarg',
 '625 stanwix tower apartments',
 'valley forge national historical park - n. gulph road',
 'roissy-en-france',
 'avondale',
 'kottayam',
 'normansfield hospital theatre'

In [10]:
len(places)

18983

In [24]:
import csv
# Collect the second column of countries.csv, stripped and lower-cased, so
# it can be compared against the normalised place names.
country_list = set()
with open('tempdata/countries.csv', 'rb') as csvfile:
    countries = csv.reader(csvfile, delimiter=',')
    country_list.update(fields[1].strip().lower() for fields in countries)

# Places that are not countries.
places_no_country = places.difference(country_list)

In [40]:
# Add one boolean column per non-country place: True for the movies whose
# separated_places list contains that place.
for place in places_no_country:
    df_boolean[place] = [
        (place in row_places) for row_places in df_boolean["separated_places"]
    ]
df_boolean

  return key in self._engine
  return self._engine.get_loc(_values_from_object(key))


Unnamed: 0,level_0,index,budget,contentRating,country,critic_ratings,duration,genre,gross,language,location_page,name,opening_weekend,release_dates,url,user_ratings,user_ratings_count,year,separated_places,square des francine,rhineland-palatinate,glamis,value drugs,parc de sceaux,mgm grand hotel - 3799 las vegas boulevard south,tilton,saylorsburg,stonehurst,uithoorn,great otway national park,mazari sharif,roslyn,khanua,new beverly cinema,jim henson company studios - 1416 n. la brea avenue,revere,zittau,laiya-aplaya,luján,lillestrøm,minsk,yogyakarta,indonesia,brownstone street,chishui,47 rue de la goutte d'or,cropvale farm,strömstad,colorno,1721 west olive ave,...,pentrebach,antonito,american falls,tama river,dilijan,sopelana,moab,callanwolde fine arts center - 980 briarcliff road ne,tredegar,trona pinnacles - 300 s. richmond road,nochixtlán,figueras,molskroen,clapham junction,chioggia,newport beach,saschiz,lleida,wellness center,westin bonaventure hotel & suites - 404 s. figueroa street,chaves,london palladium,museu romànic can papiol,wieliszew,howard st & s 11th st,shere,kinde,harefield,place du colonel fabien,vence,whitby,keswick,fort bonifacio,tai-tung,rondo dmowskiego,enontekiö,tarrytown,bortanic gardens hakgala,brechin,2605 west 5th avenue,totes gebirge,calafate,makati city,west sumatra,tremonton,gorham,heathcote,richardson,heide,creuzburg
0,0,0,"€1,500,000",,Kazakhstan,,110 min,[Drama],,Russian,"[Minsk, Belarus , Almaty, Kazakhstan , St. Pet...",Ya ne vernus,,[ 1 March 2014 (Russia) ],/title/tt2637844/,6.9,320,2014,"[minsk, belarus, almaty, kazakhstan, st. peter...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2,3,,,USA,,85 min,"[Horror, Mystery, Thriller]",,English,"[Silt, Colorado, USA ]",Find Me,,[ 1 September 2014 (USA) ],/title/tt3027188/,4.5,649,2014,"[silt, colorado, usa]",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,3,4,,,Ireland,,88 min,"[Comedy, Drama, Family]",,English,"[Dublin, County Dublin, Ireland , County Wickl...",Gold,,[ 10 October 2014 (Ireland) ],/title/tt3134422/,6.1,406,2014,"[dublin, county dublin, ireland, county wicklo...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,7,10,,Not,USA,,85 min,"[Action, Sci-Fi, Thriller]",,English,"[Los Angeles, California, USA , Long Beach, Ca...",Mega Shark vs. Mecha Shark,,[ 28 January 2014 (USA) ],/title/tt3152098/,2.6,1988,2014,"[los angeles, california, usa, long beach, cal...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,8,11,,,Poland,,117 min,"[Drama, Romance]",,Polish,"[Ancona, Marche, Italy , Warsaw, Mazowieckie, ...",Obce cialo,,[ 5 December 2014 (Poland) ],/title/tt3997248/,4.6,156,2014,"[ancona, marche, italy, warsaw, mazowieckie, p...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,12,19,"$50,000",,USA,,,[Comedy],,English,"[Angelus Oaks, California, USA , California, U...",Camp-Off,,[ 4 April 2014 (USA) ],/title/tt3482042/,7.3,7,2014,"[angelus oaks, california, usa, california, us...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,14,21,,,Germany,,84 min,"[Comedy, Drama]",,German,"[Akademie der Künste, Berlin, Germany , Haus d...",Ich will mich nicht künstlich aufregen,,[ February 2014 (Germany) ],/title/tt3471498/,6.2,28,2014,"[akademie der künste, berlin, germany, haus de...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,15,23,,,Sweden,,99 min,"[Action, Crime, Drama]",,Swedish,"[Falkenberg, Hallands län, Sweden , Uddevalla,...",Svart kung,,[ 15 March 2014 (Sweden) ],/title/tt2935416/,6.2,42,2014,"[falkenberg, hallands län, sweden, uddevalla, ...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,16,24,"$7,100",,USA,,92 min,[Action],,English,"[New York, USA , Paulsboro, New Jersey, USA , ...",Battle,,[ 25 December 2014 (USA) ],/title/tt2094769/,6.6,8,2014,"[new york, usa, paulsboro, new jersey, usa, ph...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,20,29,,,UK,,90 min,[Horror],,English,"[Wales, UK ]",Valley of the Witch,,[ 13 January 2015 (USA) ],/title/tt2908340/,5.0,67,2014,"[wales, uk]",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [41]:
len(df_boolean.columns)

18827

In [46]:
df_boolean.to_csv('df_boolean2.csv',sep=',',encoding='utf-8')

In [None]:
# Rebuild the per-movie list of normalised place parts and attach it to a
# fresh copy of new_df.
# NOTE(review): this rebinds places_freq to an empty list that is never
# filled in this cell, discarding any frequency dict built earlier -
# confirm that is intended.
places_freq = []

# Start from an empty list. The previous version kept appending to an
# already populated separated_places, which made it longer than new_df and
# broke the column assignment below. `places` is still expected to exist.
separated_places = []
for row in new_df["location_page"]:
    temp = []
    for loc in row:
        for item in loc.split(","):
            stripped = item.encode('utf8').strip().lower()
            if stripped!="":
                places.add(stripped)
                temp.append(stripped)
    separated_places.append(temp)
df_boolean = new_df.copy()
df_boolean['separated_places'] = separated_places

In [17]:
# Gather every distinct genre (as a utf-8 encoded string) over all movies.
genreset = set()

for genreslist in new_df["genre"]:
    genreset.update(genre.encode('utf8') for genre in genreslist)

In [18]:
print len(genreset)
genreset

24


{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western'}

Data Cleaning part

In [36]:
# Read the world cities file into a list of raw lines (newline included).
# The with-statement closes the file, which was previously left open.
with open('tempdata/worldcitiespop.txt', 'r') as citiesfile:
    citiesfile.readline()  # discard the first line of the file
    citiesraw = citiesfile.readlines()

In [37]:
print len(cities)

3173958


In [38]:
# For every raw line: drop its last character, split on commas and keep the
# second field.
cities = [raw_line[:-1].split(',')[1] for raw_line in citiesraw]

In [59]:
print len(cities)
print cities[0:20]
print "usa" in cities

3173958
['aixas', 'aixirivali', 'aixirivall', 'aixirvall', 'aixovall', 'andorra', 'andorra la vella', 'andorra-vieille', 'andorre', 'andorre-la-vieille', 'andorre-vieille', 'ansalonga', 'anyos', 'arans', 'arinsal', 'aubinya', 'auvinya', 'bicisarri', 'bixessarri', 'bixisarri']
True







In [51]:
for entry in df_boolean['separated_places']:
    print entry

['minsk', 'belarus', 'almaty', 'kazakhstan', 'st. petersburg', 'russia']
['silt', 'colorado', 'usa']
['dublin', 'county dublin', 'ireland', 'county wicklow', 'ireland']
['los angeles', 'california', 'usa', 'long beach', 'california', 'usa']
['ancona', 'marche', 'italy', 'warsaw', 'mazowieckie', 'poland', 'moscow', 'russia']
['angelus oaks', 'california', 'usa', 'california', 'usa', 'usa']
['akademie der k\xc3\xbcnste', 'berlin', 'germany', 'haus der kulturen der welt', 'berlin', 'germany']
['falkenberg', 'hallands l\xc3\xa4n', 'sweden', 'uddevalla', 'v\xc3\xa4stra g\xc3\xb6talands l\xc3\xa4n', 'sweden', 'gothenburg', 'v\xc3\xa4stra g\xc3\xb6talands l\xc3\xa4n', 'sweden', 'halmstad', 'hallands l\xc3\xa4n', 'sweden']
['new york', 'usa', 'paulsboro', 'new jersey', 'usa', 'philadelphia', 'pennsylvania', 'usa']
['wales', 'uk']
['denver', 'colorado', 'usa']
['santiago', 'chile']
['lisbon', 'portugal']
['marquam', 'oregon', 'usa', 'silverton', 'oregon', 'usa', 'salem', 'oregon', 'usa', 'corva

In [52]:
df_boolean[:10]

Unnamed: 0,level_0,index,budget,contentRating,country,critic_ratings,duration,genre,gross,language,location_page,name,opening_weekend,release_dates,url,user_ratings,user_ratings_count,year,separated_places
0,0,0,"€1,500,000",,Kazakhstan,,110 min,[Drama],,Russian,"[Minsk, Belarus , Almaty, Kazakhstan , St. Pet...",Ya ne vernus,,[ 1 March 2014 (Russia) ],/title/tt2637844/,6.9,320,2014,"[minsk, belarus, almaty, kazakhstan, st. peter..."
1,2,3,,,USA,,85 min,"[Horror, Mystery, Thriller]",,English,"[Silt, Colorado, USA ]",Find Me,,[ 1 September 2014 (USA) ],/title/tt3027188/,4.5,649,2014,"[silt, colorado, usa]"
2,3,4,,,Ireland,,88 min,"[Comedy, Drama, Family]",,English,"[Dublin, County Dublin, Ireland , County Wickl...",Gold,,[ 10 October 2014 (Ireland) ],/title/tt3134422/,6.1,406,2014,"[dublin, county dublin, ireland, county wicklo..."
3,7,10,,Not,USA,,85 min,"[Action, Sci-Fi, Thriller]",,English,"[Los Angeles, California, USA , Long Beach, Ca...",Mega Shark vs. Mecha Shark,,[ 28 January 2014 (USA) ],/title/tt3152098/,2.6,1988,2014,"[los angeles, california, usa, long beach, cal..."
4,8,11,,,Poland,,117 min,"[Drama, Romance]",,Polish,"[Ancona, Marche, Italy , Warsaw, Mazowieckie, ...",Obce cialo,,[ 5 December 2014 (Poland) ],/title/tt3997248/,4.6,156,2014,"[ancona, marche, italy, warsaw, mazowieckie, p..."
5,12,19,"$50,000",,USA,,,[Comedy],,English,"[Angelus Oaks, California, USA , California, U...",Camp-Off,,[ 4 April 2014 (USA) ],/title/tt3482042/,7.3,7,2014,"[angelus oaks, california, usa, california, us..."
6,14,21,,,Germany,,84 min,"[Comedy, Drama]",,German,"[Akademie der Künste, Berlin, Germany , Haus d...",Ich will mich nicht künstlich aufregen,,[ February 2014 (Germany) ],/title/tt3471498/,6.2,28,2014,"[akademie der künste, berlin, germany, haus de..."
7,15,23,,,Sweden,,99 min,"[Action, Crime, Drama]",,Swedish,"[Falkenberg, Hallands län, Sweden , Uddevalla,...",Svart kung,,[ 15 March 2014 (Sweden) ],/title/tt2935416/,6.2,42,2014,"[falkenberg, hallands län, sweden, uddevalla, ..."
8,16,24,"$7,100",,USA,,92 min,[Action],,English,"[New York, USA , Paulsboro, New Jersey, USA , ...",Battle,,[ 25 December 2014 (USA) ],/title/tt2094769/,6.6,8,2014,"[new york, usa, paulsboro, new jersey, usa, ph..."
9,20,29,,,UK,,90 min,[Horror],,English,"[Wales, UK ]",Valley of the Witch,,[ 13 January 2015 (USA) ],/title/tt2908340/,5.0,67,2014,"[wales, uk]"


In [129]:
# For every movie keep only the place parts that appear in citiesflatten.
# A set is built once so that each membership test no longer scans the whole
# citiesflatten list.
city_lookup = set(citiesflatten)
separated_cities = []
for x in range(len(df_boolean)):
    temp = df_boolean['separated_places'][x]
    separated_cities.append([part for part in temp if part in city_lookup])
    # Progress indicator: print every 100th row number.
    if (x%100) == 0:
        print(x)

df_boolean["separated_cities"] = separated_cities

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000




In [87]:
import codecs
# Decode the UTF-16 file and parse each of its lines as a JSON document.
cities2raw = []
with codecs.open('tempdata/CountriesToCities.json', 'r', encoding= "UTF-16") as f:
    cities2raw.extend(json.loads(line) for line in f)

In [178]:
#print cities2raw

In [91]:
cities2 = cities2raw[0].values()

In [99]:
citiesflatten = [item.lower() for sublist in cities2 for item in sublist]

In [100]:
print len(citiesflatten)

86112


In [124]:
# Spot check: print every key of the first parsed JSON object whose value
# contains the string in `test` (keys are presumably country names and the
# values their lists of cities - verify against the data file).
test = "Egypt"
for key in cities2raw[0]:
    if test in cities2raw[0][key]:
        print key

United States


In [140]:
df_boolean.head(10)

Unnamed: 0,level_0,index,budget,contentRating,country,critic_ratings,duration,genre,gross,language,location_page,name,opening_weekend,release_dates,url,user_ratings,user_ratings_count,year,separated_places,separated_cities
0,0,0,"€1,500,000",,Kazakhstan,,110 min,[Drama],,Russian,"[Minsk, Belarus , Almaty, Kazakhstan , St. Pet...",Ya ne vernus,,[ 1 March 2014 (Russia) ],/title/tt2637844/,6.9,320,2014,"[minsk, belarus, almaty, kazakhstan, st. peter...","[minsk, almaty, russia]"
1,2,3,,,USA,,85 min,"[Horror, Mystery, Thriller]",,English,"[Silt, Colorado, USA ]",Find Me,,[ 1 September 2014 (USA) ],/title/tt3027188/,4.5,649,2014,"[silt, colorado, usa]","[silt, colorado, usa]"
2,3,4,,,Ireland,,88 min,"[Comedy, Drama, Family]",,English,"[Dublin, County Dublin, Ireland , County Wickl...",Gold,,[ 10 October 2014 (Ireland) ],/title/tt3134422/,6.1,406,2014,"[dublin, county dublin, ireland, county wicklo...","[dublin, ireland, ireland]"
3,7,10,,Not,USA,,85 min,"[Action, Sci-Fi, Thriller]",,English,"[Los Angeles, California, USA , Long Beach, Ca...",Mega Shark vs. Mecha Shark,,[ 28 January 2014 (USA) ],/title/tt3152098/,2.6,1988,2014,"[los angeles, california, usa, long beach, cal...","[los angeles, california, usa, long beach, cal..."
4,8,11,,,Poland,,117 min,"[Drama, Romance]",,Polish,"[Ancona, Marche, Italy , Warsaw, Mazowieckie, ...",Obce cialo,,[ 5 December 2014 (Poland) ],/title/tt3997248/,4.6,156,2014,"[ancona, marche, italy, warsaw, mazowieckie, p...","[ancona, marche, italy, warsaw, poland, moscow..."
5,12,19,"$50,000",,USA,,,[Comedy],,English,"[Angelus Oaks, California, USA , California, U...",Camp-Off,,[ 4 April 2014 (USA) ],/title/tt3482042/,7.3,7,2014,"[angelus oaks, california, usa, california, us...","[california, usa, california, usa, usa]"
6,14,21,,,Germany,,84 min,"[Comedy, Drama]",,German,"[Akademie der Künste, Berlin, Germany , Haus d...",Ich will mich nicht künstlich aufregen,,[ February 2014 (Germany) ],/title/tt3471498/,6.2,28,2014,"[akademie der künste, berlin, germany, haus de...","[berlin, berlin]"
7,15,23,,,Sweden,,99 min,"[Action, Crime, Drama]",,Swedish,"[Falkenberg, Hallands län, Sweden , Uddevalla,...",Svart kung,,[ 15 March 2014 (Sweden) ],/title/tt2935416/,6.2,42,2014,"[falkenberg, hallands län, sweden, uddevalla, ...","[falkenberg, uddevalla, gothenburg, halmstad]"
8,16,24,"$7,100",,USA,,92 min,[Action],,English,"[New York, USA , Paulsboro, New Jersey, USA , ...",Battle,,[ 25 December 2014 (USA) ],/title/tt2094769/,6.6,8,2014,"[new york, usa, paulsboro, new jersey, usa, ph...","[new york, usa, paulsboro, usa, philadelphia, ..."
9,20,29,,,UK,,90 min,[Horror],,English,"[Wales, UK ]",Valley of the Witch,,[ 13 January 2015 (USA) ],/title/tt2908340/,5.0,67,2014,"[wales, uk]",[wales]


In [144]:
df_boolean.to_csv('df_boolean_temp.csv',sep=',',encoding='utf-8')

In [173]:
# Collect the distinct entries of separated_cities and count how often each
# one occurs.
places3 = set()
places3_freq = {}
for row in df_boolean["separated_cities"]:
    for loc in row:
        places3.add(loc)
        # Count the current entry `loc`. The previous code counted the
        # stale variable `stripped` left over from another cell, and its
        # initialise-then-increment pattern made counts one too high.
        places3_freq[loc] = places3_freq.get(loc, 0) + 1
print(len(places3))

6536


In [145]:
# Add one boolean column per place: True for the movies whose
# separated_cities list contains that place.
# NOTE(review): `places2` is not defined in any cell visible here (the set
# built from separated_cities is called places3) - confirm which set is meant.
for place in places2:
        df_boolean[place] = [place in location for location in df_boolean.separated_cities]

#3. Filbert: run from here onwards

In [30]:
# Reload the checkpoint CSV. Values are parsed from text, so columns that
# held lists before saving come back as plain strings such as "[a, b]".
new_df_input = pd.read_csv('df_boolean_temp.csv',sep=',',encoding='utf-8')

In [31]:
# Remove the bookkeeping columns left over from the CSV round trip, in the
# same order as before, then preview the first rows.
for leftover_column in ('level_0', 'Unnamed: 0'):
    new_df_input.drop(leftover_column, axis=1, inplace=True)
new_df_input.head()

Unnamed: 0,index,budget,contentRating,country,critic_ratings,duration,genre,gross,language,location_page,name,opening_weekend,release_dates,url,user_ratings,user_ratings_count,year,separated_places,separated_cities
0,0,"€1,500,000",,Kazakhstan,,110 min,[Drama],,Russian,"[Minsk, Belarus , Almaty, Kazakhstan , St. Pet...",Ya ne vernus,,[ 1 March 2014 (Russia) ],/title/tt2637844/,6.9,320,2014,"[minsk, belarus, almaty, kazakhstan, st. peter...","[minsk, almaty, russia]"
1,3,,,USA,,85 min,"[Horror, Mystery, Thriller]",,English,"[Silt, Colorado, USA ]",Find Me,,[ 1 September 2014 (USA) ],/title/tt3027188/,4.5,649,2014,"[silt, colorado, usa]","[silt, colorado, usa]"
2,4,,,Ireland,,88 min,"[Comedy, Drama, Family]",,English,"[Dublin, County Dublin, Ireland , County Wickl...",Gold,,[ 10 October 2014 (Ireland) ],/title/tt3134422/,6.1,406,2014,"[dublin, county dublin, ireland, county wicklo...","[dublin, ireland, ireland]"
3,10,,Not,USA,,85 min,"[Action, Sci-Fi, Thriller]",,English,"[Los Angeles, California, USA , Long Beach, Ca...",Mega Shark vs. Mecha Shark,,[ 28 January 2014 (USA) ],/title/tt3152098/,2.6,1988,2014,"[los angeles, california, usa, long beach, cal...","[los angeles, california, usa, long beach, cal..."
4,11,,,Poland,,117 min,"[Drama, Romance]",,Polish,"[Ancona, Marche, Italy , Warsaw, Mazowieckie, ...",Obce cialo,,[ 5 December 2014 (Poland) ],/title/tt3997248/,4.6,156,2014,"[ancona, marche, italy, warsaw, mazowieckie, p...","[ancona, marche, italy, warsaw, poland, moscow..."


In [33]:
#Get set of unique cities

def unique_entries(input_df, field):
    """Return the set of distinct items found across all rows of a column.

    Each value in ``input_df[field]`` is iterated, and every item it yields
    is added to the result.
    """
    return {item for cell in input_df[field] for item in cell}

def clean_column(input_df, field):
    """Turn the bracketed strings of one column into lists, in place.

    Every '[' and ']' character is removed from each value of
    ``input_df[field]`` and the remainder is split on ", ". The column is
    overwritten with the resulting lists; nothing is returned.
    """
    input_df[field] = [
        raw.replace('[', '').replace(']', '').split(", ")
        for raw in input_df[field]
    ]

In [34]:
# Work on a copy so the freshly loaded frame stays untouched, then convert
# the two bracketed string columns into lists.
newdf_boolean = new_df_input.copy()
for list_column in ("separated_cities", "genre"):
    clean_column(newdf_boolean, list_column)

In [35]:
# Distinct city names over the whole data set.
places2 = unique_entries(newdf_boolean, "separated_cities")
# Drop the empty-string and "..." placeholder entries. discard() is used
# because remove() raises KeyError when the entry is not in the set.
places2.discard("")
places2.discard("...")
print(len(places2))

6536


In [36]:
# Distinct genre labels over the whole data set.
genres = unique_entries(newdf_boolean, "genre")
# Drop the empty-string entry. discard() is used because remove() raises
# KeyError when the entry is not in the set.
genres.discard("")
print(len(genres))
print(genres)

24
set([u'Sci-Fi', u'Crime', u'Romance', u'Animation', u'Music', u'Adult', u'Comedy', u'War', u'Horror', u'Western', u'News', u'Reality-TV', u'Thriller', u'Adventure', u'Mystery', u'Drama', u'Action', u'Musical', u'History', u'Family', u'Fantasy', u'Game-Show', u'Sport', u'Biography'])


In [37]:
# Indicator table for PCA: one boolean column per city and one per genre,
# True where the row's list contains that value.
# All columns are prepared first and handed to pandas in one constructor
# call; inserting thousands of columns one by one into a DataFrame is slow.
city_lists = newdf_boolean.separated_cities
genre_lists = newdf_boolean.genre
indicator_columns = {}
for place in places2:
    indicator_columns[place] = np.array(
        [place in cities for cities in city_lists], dtype=bool)
for genre in genres:
    indicator_columns[genre] = np.array(
        [genre in genre_entries for genre_entries in genre_lists], dtype=bool)
# Plain dicts do not guarantee ordering here, so spell out the column order:
# cities first, then genres (a name present in both keeps its city slot).
column_order = list(places2) + [name for name in genres if name not in places2]
PCAdf = pd.DataFrame(indicator_columns, columns=column_order)

In [38]:
# Convert the True/False indicators to 1/0 integers, then display the frame.
PCAdf = PCAdf.astype(int)
PCAdf

Unnamed: 0,collegeville,tilton,saylorsburg,uithoorn,roslyn,zittau,colorno,hermann,saipan,kannapolis,puente de ixtla,bratislava,roissy-en-france,avondale,kottayam,forest hills,helsingborg,sturgis,vologda,coventry,new iberia,rancho mirage,juan-les-pins,bad zurzach,inkster,chiasso,sankt peter-ording,sanderson,currie,soave,portugalete,bernalillo,inglewood,paphos,port mcneill,creully,san valentino torio,west dundee,china,torre del greco,niagara falls,manvel,gonars,sneek,brownsville,golden,hornsby,inyokern,la boca,nuevo leon,...,evesham,luling,tremonton,finglas,orono,breaza,american falls,boom,tredegar,algeciras,chioggia,chaves,wieliszew,kinde,harefield,vence,richardson,tarrytown,brechin,makati city,portneuf,gorham,heathcote,abergele,heide,creuzburg,Sci-Fi,Crime,Romance,Animation,Music,Adult,Comedy,War,Horror,Western,News,Reality-TV,Thriller,Adventure,Mystery,Drama,Action,Musical,History,Family,Fantasy,Game-Show,Sport,Biography
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
# Random sample of (at most) SAMPLESIZE rows; the seed is fixed so the same
# rows are drawn on every run.
random.seed(10)

SAMPLESIZE = 10000

newdf_boolean_sampled = new_df_input.copy()
clean_column(newdf_boolean_sampled, "separated_cities")
clean_column(newdf_boolean_sampled, "genre")

# random.sample expects a sequence and raises ValueError when asked for more
# items than are available, so pass a plain list of the index labels and cap
# the request at the number of rows.
candidate_rows = list(newdf_boolean_sampled.index)
rows = random.sample(candidate_rows, min(SAMPLESIZE, len(candidate_rows)))
# Label-based selection with .loc; the .ix indexer is deprecated.
newdf_boolean_sampled = newdf_boolean_sampled.loc[rows]

In [40]:
# Distinct city names within the sample, minus the placeholder entries
# (each one is removed only if it is actually present).
places3 = unique_entries(newdf_boolean_sampled, "separated_cities")
for placeholder in ("", "..."):
    places3.discard(placeholder)
print(len(places3))

4684


In [41]:
# Distinct genre labels within the sample.
genres3 = unique_entries(newdf_boolean_sampled, "genre")
# Drop the empty-string entry. discard() is used because remove() raises
# KeyError when the entry is not in the set.
genres3.discard("")
print(len(genres3))
print(genres3)

24
set([u'Sci-Fi', u'Crime', u'Romance', u'Animation', u'Music', u'Adult', u'Comedy', u'War', u'Horror', u'Western', u'News', u'Reality-TV', u'Thriller', u'Adventure', u'Mystery', u'Drama', u'Action', u'Musical', u'History', u'Family', u'Fantasy', u'Game-Show', u'Sport', u'Biography'])


In [42]:
# Indicator table for the sampled rows: one boolean column per city and one
# per genre, True where the row's list contains that value.
# All columns are prepared first and handed to pandas in one constructor
# call; inserting thousands of columns one by one into a DataFrame is slow.
sampled_city_lists = newdf_boolean_sampled.separated_cities
sampled_genre_lists = newdf_boolean_sampled.genre
sampled_indicator_columns = {}
for place in places3:
    sampled_indicator_columns[place] = np.array(
        [place in cities for cities in sampled_city_lists], dtype=bool)
for genre in genres3:
    sampled_indicator_columns[genre] = np.array(
        [genre in genre_entries for genre_entries in sampled_genre_lists], dtype=bool)
# Plain dicts do not guarantee ordering here, so spell out the column order:
# cities first, then genres (a name present in both keeps its city slot).
sampled_column_order = list(places3) + [name for name in genres3 if name not in places3]
PCAsampleddf = pd.DataFrame(sampled_indicator_columns, columns=sampled_column_order)

In [43]:
# Convert the True/False indicators to 1/0 integers, then display the frame.
PCAsampleddf = PCAsampleddf.astype(int)
PCAsampleddf

Unnamed: 0,trenton,bartoszyce,brindisi,bad grund,green river,ilulissat,chigasaki,collegeville,wilton manors,sichuan,nottingham,saylorsburg,uithoorn,brantford,roslyn,crete,kassel,piura,zittau,san ignacio,sherman oaks,jaipur,paris,gig harbor,islip,potomac,colorno,geyserville,el campello,las palmas de gran canaria,pushkin,bagalkot,chennai,stanton,puente de ixtla,freudenstadt,wausau,aurora,bryn mawr,folsom,mary esther,bratislava,koprivnica,rapid city,truchas,victoria falls,jenkintown,roissy-en-france,avondale,yucaipa,...,deggendorf,kirkwood,harefield,warner springs,vence,whitby,heide,tarrytown,brechin,carcassonne,banff,montrose,baker,makati city,lloret de mar,charenton-le-pont,merthyr tydfil,scotch plains,oeiras,foster city,nuuk,abergele,westfield,bierset,arenys de mar,creuzburg,Sci-Fi,Crime,Romance,Animation,Music,Adult,Comedy,War,Horror,Western,News,Reality-TV,Thriller,Adventure,Mystery,Drama,Action,Musical,History,Family,Fantasy,Game-Show,Sport,Biography
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


# PCA on size 10,000 data set

In [44]:
from sklearn.decomposition import PCA


# Keep the first 1500 principal components of the indicator table.
pca = PCA(n_components=1500)

#X should consist only of independent variables, leave out Y
# Fit the PCA on the sampled indicator table and project the same rows onto
# the fitted components in one step.
new_X = pca.fit_transform(PCAsampleddf)

In [45]:
# Fraction of the total variance explained by each retained component.
pca.explained_variance_ratio_

array([  9.58220642e-02,   6.95029293e-02,   6.10438035e-02, ...,
         4.97632721e-05,   4.97455776e-05,   4.97263273e-05])

In [46]:
# Total fraction of the variance captured by all retained components.
print pca.explained_variance_ratio_.sum()

0.950669807164


In [51]:
# Wrap the PCA output in a DataFrame with one column per retained component,
# named pc1, pc2, ... It is built in a single constructor call rather than
# one column insert per component; copy=True keeps Xdf independent of new_X.
n_components = pca.explained_variance_ratio_.shape[0]
Xdf = pd.DataFrame(
    new_X[:, :n_components],
    columns=["pc%i" % (i + 1) for i in range(n_components)],
    copy=True
)

In [52]:
Xdf

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49,pc50,...,pc1451,pc1452,pc1453,pc1454,pc1455,pc1456,pc1457,pc1458,pc1459,pc1460,pc1461,pc1462,pc1463,pc1464,pc1465,pc1466,pc1467,pc1468,pc1469,pc1470,pc1471,pc1472,pc1473,pc1474,pc1475,pc1476,pc1477,pc1478,pc1479,pc1480,pc1481,pc1482,pc1483,pc1484,pc1485,pc1486,pc1487,pc1488,pc1489,pc1490,pc1491,pc1492,pc1493,pc1494,pc1495,pc1496,pc1497,pc1498,pc1499,pc1500
0,-0.054671,-0.667735,-0.051509,0.071438,-1.125114,-0.057864,-0.103043,0.182779,-0.555498,0.194886,-0.392265,-0.327183,0.089843,-0.099590,-0.006067,-0.030017,-0.018107,0.046796,0.054153,0.060850,0.095719,0.026290,0.004459,-0.109276,-0.011302,0.108183,0.071200,-0.183546,0.117845,0.187644,0.129084,0.021947,-1.275632,-0.273159,0.059737,0.065458,-0.115415,0.008151,-0.023884,-0.003558,-0.018133,0.042920,-0.046046,-0.017571,0.085560,0.006468,0.002659,0.023285,0.023408,0.016271,...,4.008720e-17,-2.416636e-16,-4.304595e-16,7.977997e-16,6.226902e-16,1.712235e-16,1.525604e-16,-1.282025e-15,-1.746349e-15,2.753446e-16,-1.057925e-15,3.473837e-16,-5.372713e-16,1.085233e-15,3.260995e-17,-7.917314e-16,2.062973e-16,5.142082e-16,7.869819e-17,-8.110699e-16,-8.086204e-16,3.461949e-16,7.381885e-16,1.421125e-15,1.410788e-15,-5.962173e-16,-1.066265e-15,-4.139789e-16,6.291196e-16,-8.305854e-17,-3.496261e-16,-7.025948e-16,2.134925e-16,1.576685e-16,1.877964e-16,3.399812e-15,3.206796e-15,-2.774325e-15,-7.560065e-16,-1.065207e-15,1.576856e-17,1.420429e-15,5.693364e-16,-4.957702e-17,1.999555e-05,0.000077,-8.776649e-05,0.000021,0.000344,0.000039
1,-0.559098,0.528961,0.384261,0.329093,-0.441526,0.241413,-0.314843,0.063717,-0.137712,0.025493,-0.793596,-0.314882,-0.018128,-0.130567,-0.109659,0.167523,0.020994,-0.016639,0.022026,-0.025508,0.086712,-0.002918,-0.043059,0.003805,-0.006483,-0.083932,0.011238,-0.042776,0.050892,0.008383,0.030948,-0.034711,0.030158,-0.022617,0.040735,-0.008339,0.023951,-0.017533,-0.003039,-0.001115,0.018128,0.013170,-0.002603,-0.007103,0.058698,-0.009973,0.009131,-0.011656,0.017952,0.003926,...,5.917524e-16,8.651731e-16,-4.996929e-16,3.358706e-16,-5.105375e-16,-8.964527e-17,-3.657406e-15,3.030893e-15,3.155855e-15,2.908059e-16,-2.161690e-20,-2.101409e-15,1.412665e-15,-5.363769e-16,3.361926e-16,-7.285600e-16,-3.476687e-16,-1.498522e-15,-9.221433e-16,-9.157251e-17,3.371811e-16,9.053273e-16,3.090918e-16,-3.382850e-15,1.003410e-15,-5.047851e-16,1.373651e-15,-7.096861e-17,-3.129580e-16,-2.692528e-15,-5.831443e-16,1.879355e-15,6.923039e-16,3.996768e-16,-1.197872e-15,-5.360439e-15,-5.966023e-15,-3.679515e-15,2.136727e-15,4.391883e-15,-1.535302e-15,1.297292e-15,-2.330791e-15,4.618071e-15,-5.813531e-04,0.000315,7.606191e-04,-0.000952,0.000922,0.001097
2,0.605698,-0.317248,-0.456573,-0.201297,0.027689,0.519174,-0.181412,-0.173207,0.207171,0.085899,0.435236,-0.047706,-0.075669,0.379968,-0.780026,0.329434,-0.091108,-0.202659,-0.365870,-0.026240,-0.027075,0.178040,-0.033242,0.032608,0.018875,0.378307,0.660886,-0.109752,0.215599,0.017960,-0.026119,0.101722,0.091325,0.022219,0.006411,-0.038286,0.043580,-0.012281,-0.022546,-0.001715,0.005481,-0.009004,0.028218,0.002207,0.044016,-0.024214,-0.013173,-0.002810,-0.021994,0.010813,...,1.092658e-15,-2.757171e-16,3.886402e-15,-1.814047e-15,6.258244e-16,6.076209e-15,6.435365e-15,4.617015e-15,-6.172371e-15,1.977967e-15,7.217051e-17,6.018090e-16,-2.434293e-17,9.346439e-15,8.107115e-15,-4.310259e-15,-1.721207e-15,-5.889315e-15,3.980019e-15,2.314435e-15,-1.341890e-14,9.123734e-16,6.541656e-15,7.866776e-15,2.433281e-15,1.269110e-15,-5.362696e-15,4.522589e-15,-3.932512e-15,-3.624034e-16,-1.612495e-15,-2.047293e-15,5.095262e-15,-4.892511e-15,-8.364020e-15,4.488067e-15,8.830712e-15,-8.077671e-15,-1.197664e-14,-1.572264e-14,6.043724e-15,2.985317e-15,-1.870911e-14,-5.543662e-15,6.831391e-03,0.001503,-3.293530e-03,0.006546,-0.000941,0.004943
3,0.122195,0.505141,-0.663684,0.100355,0.431913,-0.596676,0.010536,-0.021697,-0.173054,0.142369,-0.146283,0.175177,0.112980,-0.080447,-0.075686,-0.035606,0.057447,0.007959,-0.082054,-0.069591,-0.007621,0.030923,0.030619,0.024967,0.035686,-0.007254,-0.014101,0.007627,-0.009955,-0.027126,-0.030629,0.009526,-0.017070,-0.022761,-0.022732,0.017235,-0.028117,0.033467,-0.012626,0.010692,0.008444,0.011329,0.016809,-0.006580,0.005021,0.025291,0.003541,0.017156,0.006265,0.020961,...,-1.296166e-15,-7.255212e-16,7.507221e-16,-4.707910e-17,1.750731e-15,4.185143e-16,-1.425053e-15,7.464897e-16,2.226493e-15,6.753062e-16,1.475870e-15,1.155077e-15,-1.153744e-17,-3.734583e-18,-1.679327e-16,1.353520e-15,-3.721086e-16,1.222416e-15,-1.854764e-16,1.339227e-15,5.696431e-16,5.883859e-16,1.994790e-15,9.663866e-17,-4.256813e-16,-2.010611e-16,-2.916989e-16,-3.437479e-17,-1.580179e-15,-7.880518e-16,5.566269e-16,3.233892e-15,2.667525e-16,1.593455e-15,1.836928e-15,-1.446920e-15,-2.140009e-15,1.969364e-15,-1.292466e-15,-4.531181e-16,-1.194158e-15,-1.298969e-15,-9.847258e-16,3.754747e-16,-6.075435e-04,-0.000349,-1.032485e-04,0.000043,-0.000952,0.000449
4,-0.041998,-0.726603,0.165353,0.744768,-0.273193,-0.019832,-0.001257,0.088853,0.172461,-0.198826,0.041263,0.521475,0.040206,-0.043234,0.001881,-0.018921,0.067836,-0.029477,-0.050154,-0.002694,0.010831,0.014748,0.007793,0.034248,0.058374,-0.057279,0.014092,0.010424,-0.031969,-0.005260,-0.017142,-0.000387,0.011644,-0.024535,0.051546,-0.031146,0.018559,0.015220,-0.069552,-0.233002,0.055391,0.020447,0.073096,-0.029886,-0.052624,-0.005949,0.007459,0.020873,0.017398,0.005838,...,-1.476383e-16,3.906170e-16,9.077769e-16,6.475617e-17,-8.574218e-16,-2.254063e-16,-9.887655e-17,-2.709420e-16,-9.900078e-16,-4.230489e-16,-1.281421e-18,4.381618e-16,-5.668985e-16,3.643400e-16,2.292572e-16,1.586448e-16,6.719108e-16,-6.207844e-16,2.560953e-16,5.066166e-16,-4.751248e-16,4.336067e-16,-3.539724e-16,1.003831e-15,-1.750540e-16,8.205857e-16,-2.935549e-16,4.457672e-16,1.792252e-16,8.511997e-16,7.302257e-16,4.415299e-17,-4.055890e-16,3.013510e-16,-1.358731e-15,4.848857e-17,1.324470e-16,1.368873e-16,-1.527337e-16,-4.945309e-16,-3.844110e-17,-1.151929e-16,4.021361e-16,-2.963736e-16,-2.765468e-05,0.000004,-7.485591e-05,-0.000030,0.000016,-0.000198
5,0.526454,0.198326,0.676026,0.019880,0.147098,0.126777,-0.410372,-0.013895,-0.140261,0.164509,0.152867,-0.008618,0.066900,0.058756,-0.003697,0.021351,0.046600,0.046019,-0.033248,-0.072367,0.020168,0.034642,-0.002546,0.033324,0.019236,-0.012971,-0.017365,0.016684,-0.011704,-0.036381,-0.030871,-0.007472,-0.010666,-0.036447,-0.011381,0.001546,-0.019825,0.012429,-0.001714,0.010065,0.004775,0.010429,0.027870,0.014070,0.011569,0.009719,-0.013570,0.006411,0.004013,0.026151,...,-1.067846e-16,1.742374e-16,2.086382e-16,9.486193e-17,-1.137369e-16,-3.182239e-16,1.402516e-16,8.863947e-17,2.986735e-16,-9.271050e-18,-4.355574e-17,-3.687818e-16,-1.279351e-16,1.835577e-16,-8.199389e-17,-1.592227e-16,1.008915e-17,-2.596009e-16,1.197974e-16,2.611041e-17,4.138604e-16,-3.682263e-16,-7.734689e-17,-1.766303e-16,-8.054396e-17,2.051267e-16,-6.512439e-17,1.432439e-16,6.577662e-17,-3.348970e-16,6.488600e-17,7.497556e-17,-7.542684e-17,-2.539668e-16,1.588485e-17,-9.875428e-16,-7.182749e-16,-3.155697e-16,3.553318e-16,5.718118e-16,-1.870692e-15,7.482324e-16,2.041830e-15,5.820424e-16,-1.531455e-05,0.000010,6.493602e-07,-0.000082,0.000010,0.000075
6,-0.471076,0.009133,-0.688812,0.290212,-0.076111,0.581322,0.031535,-0.184394,0.422865,0.030976,0.062750,-0.240041,0.003729,-0.179731,0.021016,-0.033001,-0.004856,-0.132794,-0.006966,-0.007076,0.042108,-0.002402,-0.058551,0.003234,0.033815,-0.089196,0.014400,-0.032340,0.043795,0.005569,-0.005287,-0.005760,0.017773,-0.021387,0.044204,-0.014457,0.023860,0.008334,0.015044,-0.000258,0.003854,0.004560,-0.005071,-0.016765,0.067993,-0.020093,0.012105,-0.014583,0.032532,-0.004662,...,1.900823e-14,-1.445854e-14,-4.444305e-14,2.638086e-14,3.864207e-15,-1.555388e-14,1.518233e-14,-4.435037e-14,8.035394e-14,-1.894324e-14,-3.529367e-15,1.952708e-14,-2.145564e-14,-5.190314e-14,-1.649848e-14,3.352940e-14,2.265501e-14,3.401053e-14,-4.615384e-14,-1.024352e-14,6.227680e-14,-1.087566e-14,-4.159027e-14,-2.260322e-15,-1.127642e-14,-1.638435e-14,8.757758e-15,1.821757e-14,2.416736e-14,2.429644e-14,4.232948e-14,2.958875e-14,-2.705574e-14,-2.114506e-14,3.756865e-14,3.546609e-15,-2.368774e-14,5.313903e-14,3.935091e-14,1.187346e-13,2.726673e-14,5.013274e-15,-9.569751e-14,3.255062e-14,5.324308e-03,0.017453,2.479091e-02,0.011630,-0.014281,-0.002795
7,0.698765,-0.315941,0.039852,-0.067599,0.089791,-0.143577,-0.107363,-0.087877,0.093648,0.059693,-0.014823,0.038775,0.095536,-0.035759,0.026402,-0.001420,0.051390,-0.025779,-0.053682,-0.062338,0.015531,0.037408,0.013288,0.027681,0.074268,-0.032019,-0.013770,0.017530,-0.021612,-0.018519,-0.023931,0.011091,-0.012475,-0.029224,-0.023194,-0.007378,-0.009874,0.033787,0.004126,-0.002408,-0.014468,0.022347,0.031185,0.001940,0.013181,0.004697,-0.002893,0.019717,0.007107,0.013995,...,-3.046073e-16,2.810038e-16,3.250732e-15,-3.320184e-15,-8.819063e-16,5.623192e-15,3.523132e-16,-1.853811e-15,-1.334378e-14,3.812438e-15,-1.391971e-17,-2.169909e-15,-8.670000e-16,5.775115e-15,2.752905e-15,-5.239867e-15,-2.492295e-15,-5.151720e-15,2.715769e-15,-8.533470e-16,-4.606051e-15,2.000068e-15,3.758601e-15,-2.586106e-15,1.889031e-15,-1.788208e-15,-1.468547e-15,-3.234092e-15,-4.205356e-15,-4.788702e-15,-2.645155e-15,-3.597007e-15,3.387627e-15,1.523555e-15,-1.933565e-15,-2.469594e-15,5.889714e-15,-7.042413e-15,-3.314850e-15,-1.090946e-14,-1.652706e-15,-2.553473e-15,1.275682e-14,4.799689e-15,8.871632e-03,0.001711,-1.327792e-03,0.001868,-0.004638,0.001385
8,0.295263,0.399428,-0.265506,-0.147442,-0.022913,-0.301655,0.093535,-0.252241,0.596485,-0.119591,-0.212468,-0.017967,-0.099035,-0.122980,-0.237136,-0.396375,0.041867,0.681001,0.010536,-0.036386,-0.059788,0.043759,-0.013047,-0.036801,0.000182,-0.000583,-0.038890,0.021731,-0.034985,-0.022496,-0.104333,0.006435,-0.013182,-0.030484,-0.085850,-0.042667,0.109335,0.002510,-0.021969,-0.040377,-0.151041,0.741254,-0.345368,-0.061941,-0.172997,-0.041558,-0.046728,0.007632,-0.065676,-0.068551,...,-3.533565e-16,8.287386e-16,-3.626006e-16,-7.799077e-16,5.760467e-16,9.467514e-16,-5.316219e-16,-2.525131e-16,-3.235143e-16,-7.866211e-17,3.914626e-16,-6.165191e-16,-6.094808e-16,6.372449e-16,5.063464e-16,3.976944e-16,-4.454801e-16,-9.899686e-16,-1.705192e-15,-6.614855e-16,-7.760154e-16,-3.106466e-16,4.838006e-16,-9.385842e-16,3.600898e-16,-1.224125e-16,4.661764e-16,-1.755103e-15,2.881638e-16,-2.994896e-16,-6.178267e-16,-1.019825e-15,1.304422e-15,2.128465e-16,3.983541e-16,9.338636e-16,-2.909658e-16,-9.362224e-16,1.950985e-16,-1.812932e-15,2.155393e-15,-1.059719e-16,7.384648e-16,-3.581994e-15,1.167736e-03,-0.000331,2.814856e-04,0.000203,-0.000490,-0.000439
9,0.033157,-0.659073,0.112975,0.404629,-0.087288,-0.135032,-0.069335,-0.016440,0.156077,0.039633,0.006403,-0.251930,-0.015983,-0.098630,0.079630,-0.007305,-0.008658,-0.102933,-0.018735,0.021981,0.037214,-0.006928,-0.038421,0.007938,0.072208,-0.089444,0.017878,-0.032566,0.030934,0.003316,-0.000506,-0.002250,0.012157,-0.022785,0.050654,-0.014478,0.022857,0.008976,0.012668,0.000749,-0.010096,0.010894,-0.004333,-0.010831,0.059478,-0.023308,0.011703,-0.014801,0.042873,-0.009929,...,2.693293e-16,1.841146e-16,7.136089e-17,3.871481e-16,-9.223113e-16,-6.452249e-17,-2.492857e-16,5.968842e-16,1.156358e-16,4.120179e-16,-4.395134e-16,3.429637e-16,-1.989179e-16,4.230911e-16,6.520942e-17,-2.401666e-16,1.166569e-16,-2.193126e-16,-6.635169e-16,-1.237304e-16,3.449505e-16,3.007366e-16,1.612111e-16,1.015098e-16,3.231715e-16,5.549712e-17,-3.391490e-16,-1.371324e-16,-3.814916e-17,-1.871331e-16,-5.143898e-19,-3.772305e-16,1.576911e-16,-2.536755e-16,-2.822102e-16,-5.728247e-16,-5.032697e-16,-1.156553e-15,6.081251e-16,-5.989356e-16,5.919999e-16,3.155584e-16,1.129699e-16,3.150654e-16,-4.594479e-05,0.000012,6.435594e-06,0.000007,-0.000036,0.000168
