# WEBSCRAPING MOVIE RELEASE DATES IN GERMANY

### IMPORT LIBRARIES

In [65]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pprint import pprint
from lxml import html
from lxml.html import fromstring
import urllib.request
from urllib.request import urlopen
import random
import re
import time
import subprocess
from youtube_search import YoutubeSearch
#import scrapy

### SCRAPING TARGET

In [66]:
#URL KINO-ZEIT.DE CINEMA PROGRAM
url='https://www.kino-zeit.de/filme/filmstarts/aktuell'

### SETTING SCOPE
In Germany theatrical releases are typically on Thursdays.
Setting 'weeks' to 1 will return last Thursdays theatrical releases.
Setting 'weeks' to 2 will return last Thursdays and upcoming Thursday's theatrical releases.

In [67]:
#CHOOSING NUMBER OF UPCOMING WEEKS TO SCRAPE
print('How many upcoming weeks would you like to scrape at kino-zeit.de\n---')
weeks=int(input())

How many upcoming weeks would you like to scrape at kino-zeit.de
---
10


### FUNCTIONS

In [68]:
#FUNCTION TO TURN URL INTO HTML
def make_soup(url):
    html=requests.get(url).content
    return BeautifulSoup(html,'html.parser')

#FUNCTION CREATING THE URL TO FLIP TO NEXT WEEKS PAGE
def next_url(soup):
    url=''.join([f'https://www.kino-zeit.de/filme/filmstarts/{i["href"]}' for i in soup.find_all("a",{"class": "next btn btn-primary"})])
    return url

#FUNCTION FLIPPING THROUGH THE WEEKLY OVERVIEWS
#Picking film titles, start dates and the film's detail page
def how_much_soup(url, weeks):
    df_loop=pd.DataFrame(columns=['start','film','director','actor','link','image','distro','country','trailer'])
    lst_start=[]
    lst_film=[]
    lst_link=[]

    for x in range(weeks): #for the number of weeks we are browsing
        
        soup= make_soup(url)
        lst_film.append([i.text for i in soup.find_all(['h3','a'],{'class': 'filmlink'})])
        lst_start.append([i.text for i in soup.find_all('div',{'class':'col-md-2 infospalte-1'})])
        lst_start=re.findall(r'\d\d.\d\d.\d\d\d\d', str(lst_start))
        lst_link.append([f'https://www.kino-zeit.de{i["href"]}' for i in soup.find_all(["h3","a"],{"class": "filmlink"})])
        url=next_url(soup) #calling the function for the url we flip to for the following week
    
    lst_film=[i for sublst in lst_film for i in sublst]
    df_loop['film']=lst_film
    df_loop['year']=df_loop['film'].str.findall('\d\d\d\d')
    df_loop['start']=lst_start
    df_loop['start'] = pd.to_datetime(df_loop['start'])
    lst_link= [i for sublst in lst_link for i in sublst]
    df_loop['link']=lst_link
    return (df_loop)

#FUNCTION GETTING INFOR FORM FILM DETAIL PAGES
def filmdetails(df, link):
    #CREATING OVERVIEW
    regie_list=[]
    darsteller_list=[]
    list_img_url=[]
    list_verleih=[]
    list_prodland=[]
    
    #BROWSING THE FILMS' INDIVIDUAL PAGES FOR FURTHER DETAILS 
    for i in link:
        sub_page=make_soup(i)
        regie=[i.text.replace('\nRegie','').replace('\n\n','').split(',') for i in sub_page.find_all('div',{'class':'field field--name-field-regie field--type-entity-reference field--label-above'})]
        regie_list.append(regie[0])
        darsteller=[i.text.replace('\nDarsteller','').split('\n') for i in sub_page.find_all('div',{'class':'field field--name-field-hauptdarsteller field--type-entity-reference field--label-above'})]
        darsteller_list.append(darsteller)
        img_url=[i['data-big'].split('?') for i in sub_page.find_all('img',{'id':'headerbildbig'})]
        list_img_url.append(i[0] for i in img_url)
        verleih=[i.text.replace('\nFilmverleih','').replace('\n\n','') for i in sub_page.find_all(['div','a'],{'class':'field field--name-field-distribution-film field--type-entity-reference field--label-above'})]
        list_verleih.append(verleih)
        prodland=[i.text.replace('\nProduktionsland','').replace('\n\n','') for i in sub_page.find_all(['div','a'],{'class':'field field--name-field-produktionsland field--type-entity-reference field--label-above'})]
        list_prodland.append(prodland)

    #FILLING THE DATAFRAME WITH THOSE NEW DATA
    df['director']=regie_list
    df['actor']=darsteller_list
    df['image']=list_img_url
    df['distro']=list_verleih
    df['country']=list_prodland
        
    #CLEAN ACTORS COLUMN
    for i in range(len(df.actor)):
        if len(df['actor'].loc[i])==2:
            df['actor'].loc[i]=df['actor'].loc[i][0]
            df['actor'].loc[i]=list(filter(lambda x: x, list(df['actor'].loc[i])))
    
    #TURN LISTS INTO STRINGS TO CLEAN THEM UP
    for i in range(len(df)):
        df['film']=df['film'].str.replace('\(\d\d\d\d\)','')
        df['country'][i]=''.join(df['country'][i]).replace('\n',', ')
        df['director'][i]=', '.join(df['director'][i]).replace('\n',', ')
        df['actor'][i]=', '.join(df['actor'][i])
        df['image'][i]=''.join(df['image'][i])
        df['distro'][i]=''.join(df['distro'][i]).replace('\n',', ')
    return df

###YOUTUBE TRAILER
#FUNCTION SEARCHING YOUTUBE RETURNING A YOUTUBE LINK
def youtube_link (film, director, distro):
    results = YoutubeSearch(f'{film} {director} {distro} Trailer', max_results=1).to_dict() #returns a dictionar
    if results ==[]:
        return('couldnt find youtube trailer')
    else:
        suffix = results[0]['url_suffix']
        youtube_link= f'www.youtube.com{suffix}'
        return(youtube_link)

# START PROGRAM

In [69]:
#STARTIN PROGRAM
df=how_much_soup(url, weeks)
df=filmdetails(df, df['link'])
   
# CALL FUNCTION FOR YOUTUBE TRAILER
df['trailer']='-'
for i in range(len(df)):
    df['trailer'].loc[i] = youtube_link(df['film'][i], df['director'][i],df['distro'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country'][i]=''.join(df['country'][i]).replace('\n',', ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['director'][i]=', '.join(df['director'][i]).replace('\n',', ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

## EXPORT EXCEL + CSV  WITH TIMESTAMP

In [77]:
timestr = time.strftime("%Y%m%d")
df.to_csv(f'exported_files/Filmstarts_{timestr}.csv')
df.to_excel(f'exported_files/Filmstarts_{timestr}.xlsx')

### RINGS BELL WHEN SCRIPT FINISHED

In [71]:
#RINGS THE BELL WHEN SCRIPT IS DONE
audio_file = "service-bell.mp3"
return_code = subprocess.call(["afplay", audio_file])

### Subsetting 
films with production country 'Deutschland' and after a certain date

In [72]:
df=df[(df['country'].str.contains('Deutschland'))&(df['start']>'2020-11-05')]
df

Unnamed: 0,start,film,director,actor,link,image,distro,country,trailer,year
46,2020-11-11,Kehraus,Hanns Christian Müller,"Gerhard Polt, Nikolaus Paryla, Jochen Busse, G...",https://www.kino-zeit.de/node/32576,https://www.kino-zeit.de/sites/default/files/s...,CROCO Filmverleih & Vertrieb GmbH,Deutschland,www.youtube.com/watch?v=dJolngEBM7s,[1983]
59,2020-12-11,Freistaat Mittelpunkt,Kai Ehlers,,https://www.kino-zeit.de/node/50794,https://www.kino-zeit.de/sites/default/files/s...,Kai Ehlers Film,Deutschland,www.youtube.com/watch?v=LIXdl4H-IxI,[2019]
60,2020-12-11,Kaiserschmarrndrama,Ed Herzog,"Sebastian Bezzel, Simon Schwarz, Eisi Gulp",https://www.kino-zeit.de/node/50160,https://www.kino-zeit.de/sites/default/files/s...,Constantin Filmverleih,Deutschland,www.youtube.com/watch?v=4b2Dmkqq1Ok,[2020]
63,2020-12-11,Narren,"Sigrun Köhler, Wiltrud Baier",,https://www.kino-zeit.de/node/48852,https://www.kino-zeit.de/sites/default/files/s...,"Böller und Brot GbR , Edition Kassenfeger",Deutschland,couldnt find youtube trailer,[2019]
64,2020-12-11,Neubau,Johannes Maria Schmit,"Tucké Royale, Monika Zimmering, Jalda Rebling,...",https://www.kino-zeit.de/node/48453,https://www.kino-zeit.de/sites/default/files/s...,Salzgeber,Deutschland,www.youtube.com/watch?v=2eX64gJp1SQ,[2020]
65,2020-12-11,Now,Jim Rakete,,https://www.kino-zeit.de/node/49783,https://www.kino-zeit.de/sites/default/files/s...,W-Film,Deutschland,www.youtube.com/watch?v=_ywTUm5dCt0,[2020]
70,2020-11-19,Das perfekte Schwarz,Tom Fröhlich,,https://www.kino-zeit.de/node/48853,https://www.kino-zeit.de/sites/default/files/s...,Film Kino Text,Deutschland,www.youtube.com/watch?v=wUp1UZwo6JY,[2018]
72,2020-11-19,Die Adern der Welt,Byambasuren Davaa,"Bat-Ireedui Batmunkh, Enerel Tumen, Yalalt Nam...",https://www.kino-zeit.de/node/48742,https://www.kino-zeit.de/sites/default/files/s...,Pandora Film Verleih,"Mongolei, Deutschland",www.youtube.com/watch?v=kDP5XGZAxV0,[2020]
73,2020-11-19,Die Unbeugsamen,Torsten Körner,,https://www.kino-zeit.de/node/48315,https://www.kino-zeit.de/sites/default/files/s...,Majestic Filmverleih,Deutschland,www.youtube.com/watch?v=yLjAayYEgOQ,[2020]
74,2020-11-19,Die Wand der Schatten,Eliza Kubarska,,https://www.kino-zeit.de/node/50593,https://www.kino-zeit.de/sites/default/files/s...,Rise and Shine Cinema,"Schweiz, Polen, Deutschland",www.youtube.com/watch?v=kc4mJt6DYgc,[2020]


### Option download images

In [76]:
#def img_download(filmtitle, url):
for i in range(len(df)):
    response = requests.get(df.iloc[i][5])
    file = open(f'exported_files/{df.iloc[i][1]}.jpg', 'wb')
    file.write(response.content)
    file.close()

### Option to have resulting Excel sent by email

In [79]:
import os
import smtplib
from email.message import EmailMessage

email_address=os.environ.get('email_user')
email_password=os.environ.get('email_pass')
smtp_ssl_host=os.environ.get('DFA_mailserver')
smtp_ssl_port = 465

msg= EmailMessage()
msg['Subject']= 'Filmstarts sind exportiert! - dein bot'
msg['From']=email_address
msg['To']=''
msg.set_content('Hallo, hallo, \n\nDas Excel mit den aktuellen Filmstarts ist anbei.\n\nschöne Grüße dein Python Bot!')

with open(f'exported_files/Filmstarts_{timestr}.xlsx', 'rb') as attachment:
    file=attachment.read()

msg.add_attachment(file,maintype='Excel',subtype='xlsx', filename=f'Filmstarts_{timestr}.xlsx')

with smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port) as server:
    server.login(email_address, email_password)
    server.send_message(msg)
    server.quit()