# WEBSCRAPING MOVIE RELEASE DATES IN GERMANY

### IMPORT LIBRARIES

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pprint import pprint
from lxml import html
from lxml.html import fromstring
import urllib.request
from urllib.request import urlopen
import random
import re
import time
import subprocess
from datetime import datetime
import urllib.request
import unidecode
#import scrapy

### SCRAPING TARGET

In [2]:
#URL KINO-ZEIT.DE CINEMA PROGRAM
url='https://www.kino-zeit.de/filme/filmstarts/aktuell'

### SETTING SCOPE
In Germany theatrical releases are typically on Thursdays.
Setting 'weeks' to 1 will return last Thursdays theatrical releases.
Setting 'weeks' to 2 will return last Thursdays and upcoming Thursday's theatrical releases.

In [3]:
#CHOOSING NUMBER OF UPCOMING WEEKS TO SCRAPE
print('How many upcoming weeks would you like to scrape at kino-zeit.de\n---')
weeks=int(input())

How many upcoming weeks would you like to scrape at kino-zeit.de
---
7


### FUNCTIONS

In [4]:
#FUNCTION TO TURN URL INTO HTML
def make_soup(url):
    html=requests.get(url).content
    return BeautifulSoup(html,'html.parser')

#FUNCTION CREATING THE URL TO FLIP TO NEXT WEEKS PAGE
def next_url(soup):
    url=''.join([f'https://www.kino-zeit.de/filme/filmstarts/{i["href"]}' for i in soup.find_all("a",{"class": "next btn btn-primary"})])
    return url

#FUNCTION FLIPPING THROUGH THE WEEKLY OVERVIEWS
#Picking film titles, start dates and the film's detail page
def how_much_soup(url, weeks):
    df_loop=pd.DataFrame(columns=['start','film','director','actor','link','image','distro','country','trailer'])
    lst_start=[]
    lst_film=[]
    lst_link=[]

    for x in range(weeks): #for the number of weeks we are browsing
        
        soup= make_soup(url)
        lst_film.append([i.text for i in soup.find_all(['h3','a'],{'class': 'filmlink'})])
        lst_start.append([i.text for i in soup.find_all('div',{'class':'col-md-2 infospalte-1'})])
        lst_start=re.findall(r'\d\d.\d\d.\d\d\d\d', str(lst_start))
        lst_link.append([f'https://www.kino-zeit.de{i["href"]}' for i in soup.find_all(["h3","a"],{"class": "filmlink"})])
        url=next_url(soup) #calling the function for the url we flip to for the following week
    
    lst_film=[i for sublst in lst_film for i in sublst]
    df_loop['film']=lst_film
    df_loop['year']=df_loop['film'].str.findall('\d\d\d\d')
    df_loop['year']=df_loop['year'].apply(listToString)
    #df_loop['start']=lst_start
    lst_start=[datetime.strptime(i,'%d.%m.%Y') for i in lst_start]
    df_loop['start']=lst_start
    lst_link= [i for sublst in lst_link for i in sublst]
    df_loop['link']=lst_link
    return (df_loop)

In [5]:
# FUNCTION TO TURN LIST TO STRING
#NEEDED FOR CLEANING UP ACTORS' COLUMN
def listToString(s):  
    
    # initialize an empty string 
    str1 = " " 
    
    # return string   
    return (str1.join(s)) 

In [6]:
#FUNCTION GETTING INFOR FORM FILM DETAIL PAGES
def filmdetails(df, link):
    #CREATING OVERVIEW
    regie_list=[]
    darsteller_list=[]
    list_img_url=[]
    list_verleih=[]
    list_prodland=[]
    
    #BROWSING THE FILMS' INDIVIDUAL PAGES FOR FURTHER DETAILS 
    for i in link:
        sub_page=make_soup(i)
        regie=[i.text.replace('\nRegie','').replace('\n\n','').split(',') for i in sub_page.find_all('div',{'class':'field field--name-field-regie field--type-entity-reference field--label-above'})]
        regie_list.append(regie[0])
        darsteller=[i.text.replace('\nDarsteller','').split('\n') for i in sub_page.find_all('div',{'class':'field field--name-field-hauptdarsteller field--type-entity-reference field--label-above'})]
        darsteller_list.append(darsteller)
        img_url=[i['data-big'].split('?') for i in sub_page.find_all('img',{'id':'headerbildbig'})]
        list_img_url.append(i[0] for i in img_url)
        verleih=[i.text.replace('\nFilmverleih','').replace('\n\n','') for i in sub_page.find_all(['div','a'],{'class':'field field--name-field-distribution-film field--type-entity-reference field--label-above'})]
        list_verleih.append(verleih)
        prodland=[i.text.replace('\nProduktionsland','').replace('\n\n','') for i in sub_page.find_all(['div','a'],{'class':'field field--name-field-produktionsland field--type-entity-reference field--label-above'})]
        list_prodland.append(prodland)

    #FILLING THE DATAFRAME WITH THOSE NEW DATA
    df['director']=regie_list
    df['actor']=darsteller_list
    df['image']=list_img_url
    df['distro']=list_verleih
    df['country']=list_prodland
    
    #CLEAN ACTORS COLUMN
    a=[]
    for i in range(len(df.actor)):
        if len(df['actor'].loc[i])==2:
            a.append(list(filter(None,df.actor[i][0])))
        else:
            a.append([''])
        
    df['actor']=a
    df['actor']=df['actor'].apply(listToString)

    #TURN LISTS INTO STRINGS TO CLEAN THEM UP
    for i in range(len(df)):
        
        df['film']=df['film'].str.replace('\(\d\d\d\d\)','')
        df['country'][i]=''.join(df['country'][i]).replace('\n',', ')
        df['director'][i]=', '.join(df['director'][i]).replace('\n',', ')
        df['image'][i]=''.join(df['image'][i])
        df['distro'][i]=''.join(df['distro'][i]).replace('\n',', ')
    return df

In [7]:
### DEALING WITH UMLAUT CHARACTERS
### CURRENTLY NOT IN USE
"""def remove_odd_characters(string):
    
    u = 'ü'.encode()
    U = 'Ü'.encode()
    a = 'ä'.encode()
    A = 'Ä'.encode()
    o = 'ö'.encode()
    O = 'Ö'.encode()
    ss = 'ß'.encode()
    e = 'é' .encode()
    e = 'è' .encode()

    string = string.encode()
    string = string.replace(u, b'ue')
    string = string.replace(U, b'Ue')
    string = string.replace(a, b'ae')
    string = string.replace(A, b'Ae')
    string = string.replace(o, b'oe')
    string = string.replace(O, b'Oe')
    string = string.replace(ss, b'ss')
    string = string.replace(e, b'e')

    string = string.decode('utf-8')
    return string"""

"def remove_odd_characters(string):\n    \n    u = 'ü'.encode()\n    U = 'Ü'.encode()\n    a = 'ä'.encode()\n    A = 'Ä'.encode()\n    o = 'ö'.encode()\n    O = 'Ö'.encode()\n    ss = 'ß'.encode()\n    e = 'é' .encode()\n    e = 'è' .encode()\n\n    string = string.encode()\n    string = string.replace(u, b'ue')\n    string = string.replace(U, b'Ue')\n    string = string.replace(a, b'ae')\n    string = string.replace(A, b'Ae')\n    string = string.replace(o, b'oe')\n    string = string.replace(O, b'Oe')\n    string = string.replace(ss, b'ss')\n    string = string.replace(e, b'e')\n\n    string = string.decode('utf-8')\n    return string"

In [8]:
### DEALING WITH FOREIGN LANGUAGE CHARACTERS

def recoding_string (df):
    trailer_list=[]
    for i in range(len(df)):
        search= f"{df['film'][i].replace(' ','_')}_trailer_{df['distro'][i].replace(' ','_')}"
        #search=remove_odd_characters(search) --- calls function remove umlaut
        search=unidecode.unidecode(search) # dealing with foreign language letters so we can include them in youtube search URL
        print(search)
        html = urllib.request.urlopen(f"https://www.youtube.com/results?search_query={search}")
        video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
        if video_ids == []:
            trailer_list.append('no trailer found')
        else:
            youtube_url= f'https://youtu.be/{video_ids[0]}'
            trailer_list.append(youtube_url)
    return trailer_list

# START PROGRAM

In [9]:
#CALL FUNCTION FILM OVERVIEW
df=how_much_soup(url, weeks)
df


Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,2020-12-17,Das Neue Evangelium (2020),,,https://www.kino-zeit.de/node/51008,,,,,2020
1,2020-12-17,Die Wand der Schatten (2020),,,https://www.kino-zeit.de/node/50593,,,,,2020
2,2020-12-23,Wonder Woman 1984 (2020),,,https://www.kino-zeit.de/node/48068,,,,,1984 2020
3,2020-12-24,Bis an die Grenze (2020),,,https://www.kino-zeit.de/node/48787,,,,,2020
4,2020-12-31,Wickie und die starken Männer - Das magische S...,,,https://www.kino-zeit.de/node/48473,,,,,2019
5,2021-01-07,Chaddr - Unter uns der Fluss (2020),,,https://www.kino-zeit.de/node/50713,,,,,2020
6,2021-01-07,Krabat (2008),,,https://www.kino-zeit.de/node/16389,,,,,2008
7,2021-01-07,Neues aus der Welt (2020),,,https://www.kino-zeit.de/node/50909,,,,,2020
8,2021-01-07,Tagundnachtgleiche (2020),,,https://www.kino-zeit.de/node/48415,,,,,2020
9,2021-01-07,Zimmer 212 - In einer magischen Nacht (2019),,,https://www.kino-zeit.de/node/46391,,,,,2019


In [10]:
#CALL FUNCTION FILM DETAILS
df=filmdetails(df, df['link'])
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country'][i]=''.join(df['country'][i]).replace('\n',', ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['director'][i]=', '.join(df['director'][i]).replace('\n',', ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['image'][i]=''.join(df['image'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d

Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,2020-12-17,Das Neue Evangelium,Milo Rau,Yvan Sagnet Maia Morgenstern Enrique Irazoqui ...,https://www.kino-zeit.de/node/51008,https://www.kino-zeit.de/sites/default/files/s...,Port au Prince Pictures,"Deutschland, Schweiz",,2020
1,2020-12-17,Die Wand der Schatten,Eliza Kubarska,,https://www.kino-zeit.de/node/50593,https://www.kino-zeit.de/sites/default/files/s...,Rise and Shine Cinema,"Schweiz, Polen, Deutschland",,2020
2,2020-12-23,Wonder Woman 1984,Patty Jenkins,Gal Gadot Chris Pine Kristen Wiig Pedro Pascal...,https://www.kino-zeit.de/node/48068,https://www.kino-zeit.de/sites/default/files/s...,Warner Bros. Entertainment GmbH,USA,,1984 2020
3,2020-12-24,Bis an die Grenze,Anne Fontaine,Virginie Efira Omar Sy Grégory Gadebois Grégor...,https://www.kino-zeit.de/node/48787,https://www.kino-zeit.de/sites/default/files/s...,STUDIOCANAL GmbH Filmverleih,Frankreich,,2020
4,2020-12-31,Wickie und die starken Männer - Das magische S...,Eric Cazes,,https://www.kino-zeit.de/node/48473,https://www.kino-zeit.de/sites/default/files/s...,Leonine,"Deutschland, Frankreich, Belgien",,2019
5,2021-01-07,Chaddr - Unter uns der Fluss,Minsu Park,,https://www.kino-zeit.de/node/50713,https://www.kino-zeit.de/sites/default/files/s...,Film Kino Text,Deutschland,,2020
6,2021-01-07,Krabat,Marco Kreuzpaintner,David Kross Daniel Brühl Christian Redl Robert...,https://www.kino-zeit.de/node/16389,https://www.kino-zeit.de/sites/default/files/s...,CROCO Filmverleih & Vertrieb GmbH,Deutschland,,2008
7,2021-01-07,Neues aus der Welt,Paul Greengrass,Tom Hanks Helena Zengel Elizabeth Marvel Mare ...,https://www.kino-zeit.de/node/50909,https://www.kino-zeit.de/sites/default/files/s...,Universal Pictures International Germany GmbH,USA,,2020
8,2021-01-07,Tagundnachtgleiche,Lena Knauss,Thomas Niehaus Sarah Hostettler Aenne Schwarz ...,https://www.kino-zeit.de/node/48415,https://www.kino-zeit.de/sites/default/files/s...,farbfilm verleih GmbH,Deutschland,,2020
9,2021-01-07,Zimmer 212 - In einer magischen Nacht,Christophe Honoré,Chiara Mastroianni Carole Bouquet Camille Cott...,https://www.kino-zeit.de/node/46391,https://www.kino-zeit.de/sites/default/files/s...,Olymp Film,"Frankreich, Belgien",,2019


In [11]:
df['trailer']=recoding_string(df)

Das_Neue_Evangelium__trailer_Port_au_Prince_Pictures
Die_Wand_der_Schatten__trailer_Rise_and_Shine_Cinema
Wonder_Woman_1984__trailer_Warner_Bros._Entertainment_GmbH_
Bis_an_die_Grenze__trailer_STUDIOCANAL_GmbH_Filmverleih
Wickie_und_die_starken_Manner_-_Das_magische_Schwert__trailer_Leonine
Chaddr_-_Unter_uns_der_Fluss__trailer_Film_Kino_Text
Krabat__trailer_CROCO_Filmverleih_&_Vertrieb_GmbH
Neues_aus_der_Welt__trailer_Universal_Pictures_International_Germany_GmbH
Tagundnachtgleiche__trailer_farbfilm_verleih_GmbH
Zimmer_212_-_In_einer_magischen_Nacht__trailer_Olymp_Film
Cats_&_Dogs_3_-_Pfoten_vereint!__trailer_Warner_Bros.
Freaky__trailer_Universal_Pictures_International_Germany
Martin_Eden__trailer_Piffl_Medien_GmbH
Rosas_Hochzeit__trailer_Piffl_Medien_GmbH
Superintelligence__trailer_Warner_Bros.
Moffie__trailer_Salzgeber
Proxima_-_Die_Astronautin__trailer_Koch_Films_GmbH
Resistance_-_Widerstand__trailer_Warner_Bros._Entertainment_GmbH_
The_Trouble_with_Being_Born__trailer_eksystent_d

In [12]:
df

Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
0,2020-12-17,Das Neue Evangelium,Milo Rau,Yvan Sagnet Maia Morgenstern Enrique Irazoqui ...,https://www.kino-zeit.de/node/51008,https://www.kino-zeit.de/sites/default/files/s...,Port au Prince Pictures,"Deutschland, Schweiz",https://youtu.be/jDsg6cvZmLM,2020
1,2020-12-17,Die Wand der Schatten,Eliza Kubarska,,https://www.kino-zeit.de/node/50593,https://www.kino-zeit.de/sites/default/files/s...,Rise and Shine Cinema,"Schweiz, Polen, Deutschland",no trailer found,2020
2,2020-12-23,Wonder Woman 1984,Patty Jenkins,Gal Gadot Chris Pine Kristen Wiig Pedro Pascal...,https://www.kino-zeit.de/node/48068,https://www.kino-zeit.de/sites/default/files/s...,Warner Bros. Entertainment GmbH,USA,https://youtu.be/XW2E2Fnh52w,1984 2020
3,2020-12-24,Bis an die Grenze,Anne Fontaine,Virginie Efira Omar Sy Grégory Gadebois Grégor...,https://www.kino-zeit.de/node/48787,https://www.kino-zeit.de/sites/default/files/s...,STUDIOCANAL GmbH Filmverleih,Frankreich,https://youtu.be/EKyp_uJncVQ,2020
4,2020-12-31,Wickie und die starken Männer - Das magische S...,Eric Cazes,,https://www.kino-zeit.de/node/48473,https://www.kino-zeit.de/sites/default/files/s...,Leonine,"Deutschland, Frankreich, Belgien",https://youtu.be/WEhvBitBLfg,2019
5,2021-01-07,Chaddr - Unter uns der Fluss,Minsu Park,,https://www.kino-zeit.de/node/50713,https://www.kino-zeit.de/sites/default/files/s...,Film Kino Text,Deutschland,no trailer found,2020
6,2021-01-07,Krabat,Marco Kreuzpaintner,David Kross Daniel Brühl Christian Redl Robert...,https://www.kino-zeit.de/node/16389,https://www.kino-zeit.de/sites/default/files/s...,CROCO Filmverleih & Vertrieb GmbH,Deutschland,https://youtu.be/sUd43YtS20Y,2008
7,2021-01-07,Neues aus der Welt,Paul Greengrass,Tom Hanks Helena Zengel Elizabeth Marvel Mare ...,https://www.kino-zeit.de/node/50909,https://www.kino-zeit.de/sites/default/files/s...,Universal Pictures International Germany GmbH,USA,https://youtu.be/1TB5GNhtgRQ,2020
8,2021-01-07,Tagundnachtgleiche,Lena Knauss,Thomas Niehaus Sarah Hostettler Aenne Schwarz ...,https://www.kino-zeit.de/node/48415,https://www.kino-zeit.de/sites/default/files/s...,farbfilm verleih GmbH,Deutschland,https://youtu.be/Hxu_a4dRKDE,2020
9,2021-01-07,Zimmer 212 - In einer magischen Nacht,Christophe Honoré,Chiara Mastroianni Carole Bouquet Camille Cott...,https://www.kino-zeit.de/node/46391,https://www.kino-zeit.de/sites/default/files/s...,Olymp Film,"Frankreich, Belgien",no trailer found,2019


### Subsetting 
films with production country 'Deutschland' and after a certain date

In [13]:
df=df[(df['country'].str.contains('Deutschland'))&(df['start']>'2020-11-05')]


## EXPORT EXCEL + CSV  WITH TIMESTAMP

In [14]:
timestr = time.strftime("%Y%m%d")
df.to_csv(f'exported_files/Filmstarts_{timestr}.csv')
df.to_excel(f'exported_files/Filmstarts_{timestr}.xlsx')

### Option download images

In [15]:
#def img_download(filmtitle, url):
for i in range(len(df)):
    response = requests.get(df.iloc[i][5])
    file = open(f'exported_files/{df.iloc[i][1]}.jpg', 'wb')
    file.write(response.content)
    file.close()

### Option to have resulting Excel sent by email

In [16]:
import os
import smtplib
from email.message import EmailMessage

email_address=os.environ.get('email_user')
email_password=os.environ.get('email_pass')
smtp_ssl_host=os.environ.get('DFA_mailserver')
smtp_ssl_port = 465

msg= EmailMessage()
msg['Subject']= 'Filmstarts sind exportiert! - dein bot'
msg['From']=email_address
msg['To']='coyew69768@95ta.com'
msg.set_content('Hallo, hallo, \n\nHier ein Mailtest aus Jupyter Notebook.\n\nDas Excel mit den aktuellen Filmstarts ist anbei. Gefiltert nach Filmen mit Produktionsland Deutschland. Und Filmstart nach dem 5.November. Die Fotos sind in der Mediathek\n\nschöne Grüße dein Python Bot!')

with open(f'exported_files/Filmstarts_{timestr}.xlsx', 'rb') as attachment:
    file=attachment.read()

msg.add_attachment(file,maintype='Excel',subtype='xlsx', filename=f'Filmstarts_{timestr}.xlsx')

with smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port) as server:
    server.login(email_address, email_password)
    server.send_message(msg)
    server.quit()

### RINGS BELL WHEN SCRIPT FINISHED

In [17]:
#RINGS THE BELL WHEN SCRIPT IS DONE
audio_file = "service-bell.mp3"
return_code = subprocess.call(["afplay", audio_file])

In [18]:
"""
a=[]
for i in range(len(df.actor)):
        if len(df['actor'].loc[i])==2:
            a.append(list(filter(None,df.actor[i][0])))
        else:
           a.append([''])
df['actor_test']=a
#df['actor_test']=listToString(list(df['actor_test']))
df['actor_test']=df['actor_test'].apply(listToString)
df
"""

"\na=[]\nfor i in range(len(df.actor)):\n        if len(df['actor'].loc[i])==2:\n            a.append(list(filter(None,df.actor[i][0])))\n        else:\n           a.append([''])\ndf['actor_test']=a\n#df['actor_test']=listToString(list(df['actor_test']))\ndf['actor_test']=df['actor_test'].apply(listToString)\ndf\n"