# WEBSCRAPING MOVIE RELEASE DATES IN GERMANY

### IMPORT LIBRARIES

In [2]:

import os
import random
import re
import subprocess
import time
import urllib.request
from pprint import pprint
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from lxml.html import fromstring
from youtube_search import YoutubeSearch
#import scrapy


### SCRAPING TARGET

In [3]:
#URL KINO-ZEIT.DE CINEMA PROGRAM
# Starting page for the scrape: it is handed to how_much_soup() as the first
# weekly overview to read.
url='https://www.kino-zeit.de/filme/filmstarts/aktuell'

### SETTING SCOPE
In Germany, theatrical releases typically happen on Thursdays.
Setting 'weeks' to 1 returns last Thursday's theatrical releases.
Setting 'weeks' to 2 returns last Thursday's and the upcoming Thursday's theatrical releases.

In [4]:
#CHOOSING NUMBER OF UPCOMING WEEKS TO SCRAPE
# Keep asking until a whole number of at least 1 is entered: a non-numeric
# answer would otherwise abort the cell with a ValueError, and 0 or a negative
# number would scrape nothing.
print('How many upcoming weeks would you like to scrape at kino-zeit.de\n---')
while True:
    try:
        weeks=int(input())
    except ValueError:
        print('Please enter a whole number, e.g. 1')
        continue
    if weeks>=1:
        break
    print('Please enter a number of at least 1')

How many upcoming weeks would you like to scrape at kino-zeit.de
---
1


### FUNCTIONS

In [9]:
#FUNCTION TO TURN URL INTO HTML
def make_soup(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, so one stalled connection would hang
        the whole scrape.

    Returns
    -------
    bs4.BeautifulSoup
        The response content parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    response=requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content,'html.parser')

#FUNCTION CREATING THE URL TO FLIP TO NEXT WEEKS PAGE
def next_url(soup):
    """Return the URL of the following week's overview page.

    The link is taken from the first ``<a class="next btn btn-primary">``
    element found in ``soup``. Only the first match is used: joining the
    targets of several matching anchors would glue multiple URLs into one
    invalid address.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed weekly overview page.

    Returns
    -------
    str
        The next page's URL, or an empty string when the page has no such link.
    """
    next_links=soup.find_all("a",{"class": "next btn btn-primary"})
    if not next_links:
        return ''
    return f'https://www.kino-zeit.de/filme/filmstarts/{next_links[0]["href"]}'

#FUNCTION FLIPPING THROUGH THE WEEKLY OVERVIEWS
#Picking film titles, start dates and the film's detail page
def how_much_soup(url, weeks):
    """Collect film titles, start dates and detail-page links from the weekly overviews.

    Starting at ``url``, up to ``weeks`` consecutive overview pages are fetched
    with ``make_soup``; ``next_url`` supplies the address of each following
    page. Paging stops early when a page offers no next link, instead of
    requesting an empty URL.

    Parameters
    ----------
    url : str
        Address of the first weekly overview page.
    weeks : int
        Maximum number of overview pages to read.

    Returns
    -------
    pandas.DataFrame
        One row per film. ``film`` (title text as shown on the page), ``start``
        (date string), ``link`` (detail-page URL) and ``year`` (list of all
        four-digit groups found in the title) are filled; ``director``,
        ``actor``, ``image``, ``distro``, ``country`` and ``trailer`` are left
        empty for later steps.

    Raises
    ------
    ValueError
        Raised by pandas when the number of dates found does not match the
        number of films found.
    """
    date_pattern=r'\d\d.\d\d.\d\d\d\d'
    films=[]
    starts=[]
    links=[]

    for _ in range(weeks): #for the number of weeks we are browsing
        soup=make_soup(url)

        film_tags=soup.find_all(['h3','a'],{'class': 'filmlink'})
        films.extend(tag.text for tag in film_tags)
        links.extend(f'https://www.kino-zeit.de{tag["href"]}' for tag in film_tags)

        # Pull the dates straight out of each info cell's text.
        for cell in soup.find_all('div',{'class':'col-md-2 infospalte-1'}):
            starts.extend(re.findall(date_pattern, cell.text))

        url=next_url(soup) #the url we flip to for the following week
        if not url:
            # No further page available: stop rather than request an empty URL.
            break

    df_loop=pd.DataFrame(columns=['start','film','director','actor','link','image','distro','country','trailer'])
    df_loop['film']=films
    df_loop['year']=df_loop['film'].str.findall(r'\d\d\d\d')
    df_loop['start']=starts
    df_loop['link']=links
    return df_loop

#FUNCTION GETTING INFO FROM FILM DETAIL PAGES
def filmdetails(df, link):
    """Fill ``df`` with director, cast, image, distributor and country from each detail page.

    Every URL in ``link`` is fetched with ``make_soup`` and the labelled fields
    on that page are reduced to plain strings. All values are cleaned in plain
    Python first and written to the frame with whole-column assignments, so the
    result does not depend on chained indexing (``df[col][i] = ...``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per film and a ``film`` column; modified in place.
    link : iterable of str
        Detail-page URLs, one per row of ``df`` and in the same order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns set:
        ``director`` (names joined by ', '; '' when the page has none),
        ``actor`` (names from the first cast field joined by ', '; '' when none),
        ``image`` (image URL without its query string),
        ``distro`` and ``country`` (line breaks turned into ', ').
        A ``(YYYY)`` group is removed from ``film``.
    """
    regie_class='field field--name-field-regie field--type-entity-reference field--label-above'
    darsteller_class='field field--name-field-hauptdarsteller field--type-entity-reference field--label-above'
    verleih_class='field field--name-field-distribution-film field--type-entity-reference field--label-above'
    prodland_class='field field--name-field-produktionsland field--type-entity-reference field--label-above'

    def field_texts(page, tags, css_class):
        # Raw text of every element on the page matching tag name(s) and class.
        return [node.text for node in page.find_all(tags,{'class':css_class})]

    directors=[]
    casts=[]
    images=[]
    distributors=[]
    countries=[]

    #BROWSING THE FILMS' INDIVIDUAL PAGES FOR FURTHER DETAILS
    for page_url in link:
        sub_page=make_soup(page_url)

        # Director: only the first matching field is used; a page without one gives ''.
        regie=[text.replace('\nRegie','').replace('\n\n','').split(',')
               for text in field_texts(sub_page,'div',regie_class)]
        first_regie=regie[0] if regie else []
        directors.append(', '.join(first_regie).replace('\n',', '))

        # Cast: names are newline-separated inside the first matching field;
        # the split leaves empty strings that are dropped here.
        darsteller=[text.replace('\nDarsteller','').split('\n')
                    for text in field_texts(sub_page,'div',darsteller_class)]
        first_cast=darsteller[0] if darsteller else []
        casts.append(', '.join(name for name in first_cast if name))

        # Image: keep the part of the URL before the query string.
        image_parts=[img.get('data-big','').split('?')[0]
                     for img in sub_page.find_all('img',{'id':'headerbildbig'})]
        images.append(''.join(image_parts))

        verleih=[text.replace('\nFilmverleih','').replace('\n\n','')
                 for text in field_texts(sub_page,['div','a'],verleih_class)]
        distributors.append(''.join(verleih).replace('\n',', '))

        prodland=[text.replace('\nProduktionsland','').replace('\n\n','')
                  for text in field_texts(sub_page,['div','a'],prodland_class)]
        countries.append(''.join(prodland).replace('\n',', '))

    #FILLING THE DATAFRAME WITH THOSE NEW DATA
    df['director']=directors
    df['actor']=casts
    df['image']=images
    df['distro']=distributors
    df['country']=countries

    if len(df):
        # regex=True is spelled out because pandas 2.0 changed the default to a
        # literal match, which would leave the "(YYYY)" suffix in place.
        df['film']=df['film'].str.replace(r'\(\d\d\d\d\)','',regex=True)
    return df

###YOUTUBE TRAILER
#FUNCTION SEARCHING YOUTUBE RETURNING A YOUTUBE LINK
def youtube_link(film, director, distro):
    """Search YouTube for a film's trailer and return a link to the top hit.

    The query consists of the film title, director and distributor followed by
    the word "Trailer"; only one result is requested.

    Returns the text 'couldnt find youtube trailer' when the search yields an
    empty list, otherwise 'www.youtube.com' followed by the hit's ``url_suffix``.
    """
    query = f'{film} {director} {distro} Trailer'
    # to_dict() gives the hits in a form that is indexed below like a list of dicts
    hits = YoutubeSearch(query, max_results=1).to_dict()

    if hits == []:
        return 'couldnt find youtube trailer'

    url_suffix = hits[0]['url_suffix']
    return f'www.youtube.com{url_suffix}'

# START PROGRAM

In [10]:
#STARTING PROGRAM
# First read the weekly overviews, then enrich every film from its detail page.
df=how_much_soup(url, weeks)
df=filmdetails(df, df['link'])

# CALL FUNCTION FOR YOUTUBE TRAILER
# The column is built in one assignment: writing single cells through
# df['trailer'].loc[i] is chained assignment and is not guaranteed to reach df.
df['trailer']=[
    youtube_link(film, director, distro)
    for film, director, distro in zip(df['film'], df['director'], df['distro'])
]

## EXPORT EXCEL + CSV  WITH TIMESTAMP

In [36]:
# Make sure the target folder exists; to_csv/to_excel do not create it and
# would fail on a fresh checkout.
export_dir = 'exported_files'
os.makedirs(export_dir, exist_ok=True)

# Date stamp (YYYYMMDD) used in both file names.
timestr = time.strftime("%Y%m%d")
df.to_csv(f'{export_dir}/Filmstarts_{timestr}.csv')
df.to_excel(f'{export_dir}/Filmstarts_{timestr}.xlsx')

### RINGS BELL WHEN SCRIPT FINISHED

In [8]:
#RINGS THE BELL WHEN SCRIPT IS DONE
# The sound is only a convenience: if the "afplay" player cannot be started
# (e.g. it is not installed on this machine), report it instead of ending an
# otherwise finished run with an exception.
audio_file = "service-bell.mp3"
try:
    return_code = subprocess.call(["afplay", audio_file])
except OSError as err:
    return_code = None
    print(f'Could not play {audio_file}: {err}')

In [12]:
# Preview the first rows of the result table.
# NOTE(review): the stored output below contains an 'Unnamed: 0' column, which
# suggests it came from a frame re-read from the exported CSV rather than from
# the in-memory df built above - confirm and re-run top to bottom.
df.head()

Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,15.10.2020,Astronaut,Shelagh McLeod,"Richard Dreyfuss, Lyriq Bent, Krista Bridges, ...",https://www.kino-zeit.de/node/50228,https://www.kino-zeit.de/sites/default/files/s...,JETS Filmverleih & Vertrieb,Kanada,www.youtube.com/watch?v=vLqK4_AWmro,[2019]
1,15.10.2020,Bruno,Karl Golden,"Diarmaid Murtagh, Woody Norman, Seun Shote, Sc...",https://www.kino-zeit.de/node/48854,https://www.kino-zeit.de/sites/default/files/s...,Filmperlen,Großbritannien,www.youtube.com/watch?v=HnPj7LtDAhA,[2019]
2,15.10.2020,Der Bär in mir,Roman Droux,,https://www.kino-zeit.de/node/50292,https://www.kino-zeit.de/sites/default/files/s...,MFA + Filmdistribution,Schweiz,www.youtube.com/watch?v=JYAGKtI94eI,[2019]
3,15.10.2020,Der geheime Garten,Marc Munden,"Colin Firth, Maeve Dermody, Julie Walters, Dix...",https://www.kino-zeit.de/node/47584,https://www.kino-zeit.de/sites/default/files/s...,STUDIOCANAL GmbH Filmverleih,"Großbritannien, Frankreich",www.youtube.com/watch?v=eEyPvZxC5Mk,[2020]
4,18.10.2020,"Goodbye, America",Sarik Andreasyan,"Lyanka Gryu, Nadezhda Mikhalkova, Vladimir Yag...",https://www.kino-zeit.de/node/50874,https://www.kino-zeit.de/sites/default/files/s...,Kinostar Filmverleih,Russland,www.youtube.com/watch?v=83XgomEylq0,[2020]


### Option to have resulting Excel sent by email

In [40]:
#more complex
# Disabled draft: the whole snippet is wrapped in a triple-quoted string, so
# none of it is executed.
# NOTE(review): before enabling it, check that
#   - DFA_mailserver is defined (no assignment is visible in this notebook),
#   - msg['To'] gets a recipient (it is an empty string here),
#   - the export cell has run, because the attachment path uses `timestr`.
'''import os
import smtplib
from email.message import EmailMessage

email_address=os.environ.get('email_user')
email_password=os.environ.get('email_pass')
smtp_ssl_host=DFA_mailserver
smtp_ssl_port = 465

msg= EmailMessage()
msg['Subject']= 'Filmstarts sind exportiert!'
msg['From']=email_address
msg['To']=''
msg.set_content('Hallo, hallo, \n\nDas Excel mit den aktuellen Filmstarts ist anbei.\n\nschöne Grüße!')

with open(f'exported_files/Filmstarts_{timestr}.xlsx', 'rb') as attachment:
    file=attachment.read()

msg.add_attachment(file,maintype='Excel',subtype='xlsx', filename=f'Filmstarts_{timestr}.xlsx')

with smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port) as server:
    server.login(email_address, email_password)
    server.send_message(msg)
    server.quit()'''