# WEBSCRAPING MOVIE RELEASE DATES IN GERMANY

### IMPORT LIBRARIES

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pprint import pprint
from lxml import html
from lxml.html import fromstring
import urllib.request
from urllib.request import urlopen
import random
import re
import time
import subprocess
from datetime import datetime
import urllib.request
import unidecode
#import scrapy

### SCRAPING TARGET

In [2]:
#URL KINO-ZEIT.DE CINEMA PROGRAM
url='https://www.kino-zeit.de/filme/filmstarts/aktuell'

### SETTING SCOPE
In Germany theatrical releases are typically on Thursdays.
Setting 'weeks' to 1 will return last Thursdays theatrical releases.
Setting 'weeks' to 2 will return last Thursdays and upcoming Thursday's theatrical releases.

In [3]:
#CHOOSING NUMBER OF UPCOMING WEEKS TO SCRAPE
print('How many upcoming weeks would you like to scrape at kino-zeit.de\n---')
weeks=int(input())

How many upcoming weeks would you like to scrape at kino-zeit.de
---
6


### FUNCTIONS

In [4]:
#FUNCTION TO TURN URL INTO HTML
def make_soup(url):
    html=requests.get(url).content
    return BeautifulSoup(html,'html.parser')

#FUNCTION CREATING THE URL TO FLIP TO NEXT WEEKS PAGE
def next_url(soup):
    url=''.join([f'https://www.kino-zeit.de/filme/filmstarts/{i["href"]}' for i in soup.find_all("a",{"class": "next btn btn-primary"})])
    return url

#FUNCTION FLIPPING THROUGH THE WEEKLY OVERVIEWS
#Picking film titles, start dates and the film's detail page
def how_much_soup(url, weeks):
    df_loop=pd.DataFrame(columns=['start','film','director','actor','link','image','distro','country','trailer'])
    lst_start=[]
    lst_film=[]
    lst_link=[]

    for x in range(weeks): #for the number of weeks we are browsing
        
        soup= make_soup(url)
        lst_film.append([i.text for i in soup.find_all(['h3','a'],{'class': 'filmlink'})])
        lst_start.append([i.text for i in soup.find_all('div',{'class':'col-md-2 infospalte-1'})])
        lst_start=re.findall(r'\d\d.\d\d.\d\d\d\d', str(lst_start))
        lst_link.append([f'https://www.kino-zeit.de{i["href"]}' for i in soup.find_all(["h3","a"],{"class": "filmlink"})])
        url=next_url(soup) #calling the function for the url we flip to for the following week
    
    lst_film=[i for sublst in lst_film for i in sublst]
    df_loop['film']=lst_film
    df_loop['year']=df_loop['film'].str.findall('\d\d\d\d')
    df_loop['year']=df_loop['year'].apply(listToString)
    #df_loop['start']=lst_start
    lst_start=[datetime.strptime(i,'%d.%m.%Y') for i in lst_start]
    df_loop['start']=lst_start
    lst_link= [i for sublst in lst_link for i in sublst]
    df_loop['link']=lst_link
    return (df_loop)

In [5]:
# FUNCTION TO TURN LIST TO STRING
#NEEDED FOR CLEANING UP ACTORS' COLUMN
def listToString(s):  
    
    # initialize an empty string 
    str1 = " " 
    
    # return string   
    return (str1.join(s)) 

In [6]:
#FUNCTION GETTING INFOR FORM FILM DETAIL PAGES
def filmdetails(df, link):
    #CREATING OVERVIEW
    regie_list=[]
    darsteller_list=[]
    list_img_url=[]
    list_verleih=[]
    list_prodland=[]
    
    #BROWSING THE FILMS' INDIVIDUAL PAGES FOR FURTHER DETAILS 
    for i in link:
        print(i)
        sub_page=make_soup(i)
        regie=[i.text.replace('\nRegie','').replace('\n\n','').split(',') for i in sub_page.find_all('div',{'class':'field field--name-field-regie field--type-entity-reference field--label-above'})]
        regie_list.append(regie[0])
        darsteller=[i.text.replace('\nDarsteller','').split('\n') for i in sub_page.find_all('div',{'class':'field field--name-field-hauptdarsteller field--type-entity-reference field--label-above'})]
        darsteller_list.append(darsteller)
        img_url=[i['data-big'].split('?') for i in sub_page.find_all('img',{'id':'headerbildbig'})]
        list_img_url.append(i[0] for i in img_url)
        verleih=[i.text.replace('\nFilmverleih','').replace('\n\n','') for i in sub_page.find_all(['div','a'],{'class':'field field--name-field-distribution-film field--type-entity-reference field--label-above'})]
        list_verleih.append(verleih)
        prodland=[i.text.replace('\nProduktionsland','').replace('\n\n','') for i in sub_page.find_all(['div','a'],{'class':'field field--name-field-produktionsland field--type-entity-reference field--label-above'})]
        list_prodland.append(prodland)

    #FILLING THE DATAFRAME WITH THOSE NEW DATA
    df['director']=regie_list
    df['actor']=darsteller_list
    df['image']=list_img_url
    df['distro']=list_verleih
    df['country']=list_prodland
    
    #CLEAN ACTORS COLUMN
    a=[]
    for i in range(len(df.actor)):
        if len(df['actor'].loc[i])==2:
            a.append(list(filter(None,df.actor[i][0])))
        else:
            a.append([''])
        
    df['actor']=a
    df['actor']=df['actor'].apply(listToString)

    #TURN LISTS INTO STRINGS TO CLEAN THEM UP
    for i in range(len(df)):
        
        df['film']=df['film'].str.replace('\(\d\d\d\d\)','')
        df['country'][i]=''.join(df['country'][i]).replace('\n',', ')
        df['director'][i]=', '.join(df['director'][i]).replace('\n',', ')
        df['image'][i]=''.join(df['image'][i])
        df['distro'][i]=''.join(df['distro'][i]).replace('\n',', ')
    return df

In [7]:
### DEALING WITH UMLAUT CHARACTERS
### CURRENTLY NOT IN USE
"""def remove_odd_characters(string):
    
    u = 'ü'.encode()
    U = 'Ü'.encode()
    a = 'ä'.encode()
    A = 'Ä'.encode()
    o = 'ö'.encode()
    O = 'Ö'.encode()
    ss = 'ß'.encode()
    e = 'é' .encode()
    e = 'è' .encode()

    string = string.encode()
    string = string.replace(u, b'ue')
    string = string.replace(U, b'Ue')
    string = string.replace(a, b'ae')
    string = string.replace(A, b'Ae')
    string = string.replace(o, b'oe')
    string = string.replace(O, b'Oe')
    string = string.replace(ss, b'ss')
    string = string.replace(e, b'e')

    string = string.decode('utf-8')
    return string"""

"def remove_odd_characters(string):\n    \n    u = 'ü'.encode()\n    U = 'Ü'.encode()\n    a = 'ä'.encode()\n    A = 'Ä'.encode()\n    o = 'ö'.encode()\n    O = 'Ö'.encode()\n    ss = 'ß'.encode()\n    e = 'é' .encode()\n    e = 'è' .encode()\n\n    string = string.encode()\n    string = string.replace(u, b'ue')\n    string = string.replace(U, b'Ue')\n    string = string.replace(a, b'ae')\n    string = string.replace(A, b'Ae')\n    string = string.replace(o, b'oe')\n    string = string.replace(O, b'Oe')\n    string = string.replace(ss, b'ss')\n    string = string.replace(e, b'e')\n\n    string = string.decode('utf-8')\n    return string"

In [8]:
### DEALING WITH FOREIGN LANGUAGE CHARACTERS

def recoding_string (df):
    trailer_list=[]
    for i in range(len(df)):
        search= f"{df['film'][i].replace(' ','_')}_trailer_{df['distro'][i].replace(' ','_')}"
        #search=remove_odd_characters(search) --- calls function remove umlaut
        search=unidecode.unidecode(search) # dealing with foreign language letters so we can include them in youtube search URL
        print(search)
        html = urllib.request.urlopen(f"https://www.youtube.com/results?search_query={search}")
        video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
        if video_ids == []:
            trailer_list.append('no trailer found')
        else:
            youtube_url= f'https://youtu.be/{video_ids[0]}'
            trailer_list.append(youtube_url)
    return trailer_list

# START PROGRAM

In [9]:
#CALL FUNCTION FILM OVERVIEW
df=how_much_soup(url, weeks)
df


Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,2021-10-07,Das Haus (2021),,,https://www.kino-zeit.de/node/52997,,,,,2021
1,2021-10-07,Der Siebzehnte (2020),,,https://www.kino-zeit.de/node/50948,,,,,2020
2,2021-10-07,Der wilde Wald - Natur Natur sein lassen (2021),,,https://www.kino-zeit.de/node/51868,,,,,2021
3,2021-10-07,Die Jönsson Bande (2020),,,https://www.kino-zeit.de/node/53180,,,,,2020
4,2021-10-07,Die Verschwundene (2019),,,https://www.kino-zeit.de/node/48086,,,,,2019
...,...,...,...,...,...,...,...,...,...,...
87,2021-11-11,Narren (2019),,,https://www.kino-zeit.de/node/48852,,,,,2019
88,2021-11-11,Speer goes to Hollywood (2021),,,https://www.kino-zeit.de/node/53038,,,,,2021
89,2021-11-14,The Human Voice (2020),,,https://www.kino-zeit.de/node/50922,,,,,2020
90,2021-11-11,Who's afraid of Alice Miller? (2020),,,https://www.kino-zeit.de/node/52475,,,,,2020


In [10]:
"""
# Spezical Fall defekter Film am 20. August
df= df[df['film']!='Date A Bullett (2020)'].reset_index(drop=True)
"""

"\n# Spezical Fall defekter Film am 20. August\ndf= df[df['film']!='Date A Bullett (2020)'].reset_index(drop=True)\n"

In [11]:
#CALL FUNCTION FILM DETAILS
df=filmdetails(df, df['link'])
df

https://www.kino-zeit.de/node/52997
https://www.kino-zeit.de/node/50948
https://www.kino-zeit.de/node/51868
https://www.kino-zeit.de/node/53180
https://www.kino-zeit.de/node/48086
https://www.kino-zeit.de/node/52130
https://www.kino-zeit.de/node/53226
https://www.kino-zeit.de/node/52319
https://www.kino-zeit.de/node/52747
https://www.kino-zeit.de/node/51321
https://www.kino-zeit.de/node/52903
https://www.kino-zeit.de/node/50583
https://www.kino-zeit.de/node/52474
https://www.kino-zeit.de/node/51789
https://www.kino-zeit.de/node/52005
https://www.kino-zeit.de/node/52638
https://www.kino-zeit.de/node/52029
https://www.kino-zeit.de/node/52524
https://www.kino-zeit.de/node/42497
https://www.kino-zeit.de/node/48177
https://www.kino-zeit.de/node/51231
https://www.kino-zeit.de/node/51440
https://www.kino-zeit.de/node/51379
https://www.kino-zeit.de/node/50946
https://www.kino-zeit.de/node/53023
https://www.kino-zeit.de/node/52488
https://www.kino-zeit.de/node/48483
https://www.kino-zeit.de/nod

  df['film']=df['film'].str.replace('\(\d\d\d\d\)','')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country'][i]=''.join(df['country'][i]).replace('\n',', ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['director'][i]=', '.join(df['director'][i]).replace('\n',', ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['image'][i]=''.join(df['image'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,2021-10-07,Das Haus,Rick Ostermann,Tobias Moretti Valery Tscheplanowa Lisa Vicari...,https://www.kino-zeit.de/node/52997,https://www.kino-zeit.de/sites/default/files/s...,"not, sold.",Deutschland,,2021
1,2021-10-07,Der Siebzehnte,"Saskia Walker, Ralf Walker",Saskia Walker Devid Striesow Ralf Walker,https://www.kino-zeit.de/node/50948,https://www.kino-zeit.de/sites/default/files/s...,déjà-vu film UG,Deutschland,,2020
2,2021-10-07,Der wilde Wald - Natur Natur sein lassen,Lisa Eder,,https://www.kino-zeit.de/node/51868,https://www.kino-zeit.de/sites/default/files/s...,Mindjazz Pictures,Deutschland,,2021
3,2021-10-07,Die Jönsson Bande,Tomas Alfredson,Henrik Dorsin Anders Johansson Hedda Stiernste...,https://www.kino-zeit.de/node/53180,https://www.kino-zeit.de/sites/default/files/s...,STUDIOCANAL GmbH Filmverleih,Schweden,,2020
4,2021-10-07,Die Verschwundene,Dominik Moll,Laure Calamy Denis Menochet Valeria Bruni Tede...,https://www.kino-zeit.de/node/48086,https://www.kino-zeit.de/sites/default/files/s...,OVALmedia,"Frankreich, Deutschland",,2019
...,...,...,...,...,...,...,...,...,...,...
87,2021-11-11,Narren,"Sigrun Köhler, Wiltrud Baier",,https://www.kino-zeit.de/node/48852,https://www.kino-zeit.de/sites/default/files/s...,"Böller und Brot GbR , Edition Kassenfeger",Deutschland,,2019
88,2021-11-11,Speer goes to Hollywood,Vanessa Lapa,,https://www.kino-zeit.de/node/53038,https://www.kino-zeit.de/sites/default/files/s...,Salzgeber & Co. Medien GmbH,Israel,,2021
89,2021-11-14,The Human Voice,Pedro Almodóvar,Tilda Swinton,https://www.kino-zeit.de/node/50922,https://www.kino-zeit.de/sites/default/files/s...,StudioCanal,Spanien,,2020
90,2021-11-11,Who's afraid of Alice Miller?,Daniel Howald,,https://www.kino-zeit.de/node/52475,https://www.kino-zeit.de/sites/default/files/s...,Arsenal Filmverleih GmbH,Schweiz,,2020


In [12]:
df['trailer']=recoding_string(df)

Das_Haus__trailer_not,_sold.
Der_Siebzehnte__trailer_deja-vu_film_UG_
Der_wilde_Wald__-_Natur_Natur_sein_lassen__trailer_Mindjazz_Pictures
Die_Jonsson_Bande__trailer_STUDIOCANAL_GmbH_Filmverleih
Die_Verschwundene__trailer_OVALmedia
Ghosts__trailer_Antiheld_Filmverleih
Hakikat__trailer_AF-Media_GmbH
Here_We_Move_Here_We_Groove__trailer_Rise_and_Shine_Cinema
Hinterland__trailer_SquareOne,_Paramount_Pictures_Germany_GmbH
Hochwald__trailer_Salzgeber
Klassenkampf__trailer_Partisan_Filmverleih
Nowhere_Special__trailer_Piffl_Medien_GmbH
Rama_Dama__trailer_CROCO_Filmverleih_&_Vertrieb_GmbH
Tagebuch_einer_Biene__trailer_Filmwelt_Verleihagentur
The_Sparks_Brothers__trailer_Universal_Pictures_International_Germany_GmbH
Titane__trailer_Koch_Films_GmbH
Tochter__trailer_Warner_Bros._Entertainment_GmbH_
Uta__trailer_GMfilms,_barnsteiner-film_
Wonders_of_the_Sea__trailer_Kinostar
Auf_alles,_was_uns_glucklich_macht__trailer_Prokino_Filmverleih,_STUDIOCANAL_GmbH_Filmverleih
Boss_Baby_-_Schluss_mit_Kinde

In [13]:
df

Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,2021-10-07,Das Haus,Rick Ostermann,Tobias Moretti Valery Tscheplanowa Lisa Vicari...,https://www.kino-zeit.de/node/52997,https://www.kino-zeit.de/sites/default/files/s...,"not, sold.",Deutschland,https://youtu.be/XZ0DGffS6-Y,2021
1,2021-10-07,Der Siebzehnte,"Saskia Walker, Ralf Walker",Saskia Walker Devid Striesow Ralf Walker,https://www.kino-zeit.de/node/50948,https://www.kino-zeit.de/sites/default/files/s...,déjà-vu film UG,Deutschland,no trailer found,2020
2,2021-10-07,Der wilde Wald - Natur Natur sein lassen,Lisa Eder,,https://www.kino-zeit.de/node/51868,https://www.kino-zeit.de/sites/default/files/s...,Mindjazz Pictures,Deutschland,https://youtu.be/MbHOY6AJvUM,2021
3,2021-10-07,Die Jönsson Bande,Tomas Alfredson,Henrik Dorsin Anders Johansson Hedda Stiernste...,https://www.kino-zeit.de/node/53180,https://www.kino-zeit.de/sites/default/files/s...,STUDIOCANAL GmbH Filmverleih,Schweden,no trailer found,2020
4,2021-10-07,Die Verschwundene,Dominik Moll,Laure Calamy Denis Menochet Valeria Bruni Tede...,https://www.kino-zeit.de/node/48086,https://www.kino-zeit.de/sites/default/files/s...,OVALmedia,"Frankreich, Deutschland",https://youtu.be/o8yXaqoSlOM,2019
...,...,...,...,...,...,...,...,...,...,...
87,2021-11-11,Narren,"Sigrun Köhler, Wiltrud Baier",,https://www.kino-zeit.de/node/48852,https://www.kino-zeit.de/sites/default/files/s...,"Böller und Brot GbR , Edition Kassenfeger",Deutschland,no trailer found,2019
88,2021-11-11,Speer goes to Hollywood,Vanessa Lapa,,https://www.kino-zeit.de/node/53038,https://www.kino-zeit.de/sites/default/files/s...,Salzgeber & Co. Medien GmbH,Israel,https://youtu.be/8MByEViv2L8,2021
89,2021-11-14,The Human Voice,Pedro Almodóvar,Tilda Swinton,https://www.kino-zeit.de/node/50922,https://www.kino-zeit.de/sites/default/files/s...,StudioCanal,Spanien,https://youtu.be/bkv1K9Kr8bw,2020
90,2021-11-11,Who's afraid of Alice Miller?,Daniel Howald,,https://www.kino-zeit.de/node/52475,https://www.kino-zeit.de/sites/default/files/s...,Arsenal Filmverleih GmbH,Schweiz,https://youtu.be/XvfYUXBaF6M,2020


### Subsetting 
films with production country 'Deutschland' and after a certain date

In [14]:
df=df[(df['country'].str.contains('Deutschland'))&(df['start']>'2021-10-01')]


## EXPORT EXCEL + CSV  WITH TIMESTAMP

In [15]:
timestr = time.strftime("%Y%m%d")
df.to_csv(f'exported_files/Filmstarts_{timestr}.csv')
df.to_excel(f'exported_files/Filmstarts_{timestr}.xlsx')

### Option download images

In [16]:
#def img_download(filmtitle, url):
for i in range(len(df)):
    response = requests.get(df.iloc[i][5])
    file = open(f'exported_files/{df.iloc[i][1]}.jpg', 'wb')
    file.write(response.content)
    file.close()

### Option to have resulting Excel sent by email

In [17]:
import os
import smtplib
from email.message import EmailMessage

email_address=os.environ.get('email_user')
email_password=os.environ.get('email_pass')
smtp_ssl_host=os.environ.get('DFA_mailserver')
smtp_ssl_port = 465

msg= EmailMessage()
msg['Subject']= 'Filmstarts sind exportiert! - dein bot'
msg['From']=email_address
msg['To']='spitzer@deutsche-filmakademie.de'
msg.set_content('Hallo, hallo, \n\nHier ein Mailtest aus Jupyter Notebook.\n\nDas Excel mit den aktuellen Filmstarts ist anbei. Gefiltert nach Filmen mit Produktionsland Deutschland. Und Filmstart nach dem 5.November. Die Fotos sind in der Mediathek\n\nschöne Grüße dein Python Bot!')

with open(f'exported_files/Filmstarts_{timestr}.xlsx', 'rb') as attachment:
    file=attachment.read()

msg.add_attachment(file,maintype='Excel',subtype='xlsx', filename=f'Filmstarts_{timestr}.xlsx')

with smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port) as server:
    server.login(email_address, email_password)
    server.send_message(msg)
    server.quit()

### RINGS BELL WHEN SCRIPT FINISHED

In [18]:
#RINGS THE BELL WHEN SCRIPT IS DONE
audio_file = "service-bell.mp3"
return_code = subprocess.call(["afplay", audio_file])

In [19]:
"""
a=[]
for i in range(len(df.actor)):
        if len(df['actor'].loc[i])==2:
            a.append(list(filter(None,df.actor[i][0])))
        else:
           a.append([''])
df['actor_test']=a
#df['actor_test']=listToString(list(df['actor_test']))
df['actor_test']=df['actor_test'].apply(listToString)
df
"""

"\na=[]\nfor i in range(len(df.actor)):\n        if len(df['actor'].loc[i])==2:\n            a.append(list(filter(None,df.actor[i][0])))\n        else:\n           a.append([''])\ndf['actor_test']=a\n#df['actor_test']=listToString(list(df['actor_test']))\ndf['actor_test']=df['actor_test'].apply(listToString)\ndf\n"