### Importing Libraries

In [63]:
# Basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import cpi #library for inflation-data


# Scraping IMDb
import re
import imdb
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

#df = pd.read_csv('C:/Users/admin1/Documents/GitHub/ds22_project/data/movies.csv')
df = pd.read_csv('./data/movies.csv')

### Functions

#### 1.1 --- check_non_numeric_values

In [64]:
def check_non_numeric_values(df, column):
    """Function takes in dataset and column. No kreturn, Printing out found non numeric values in the column."""

    # convert column to numeric data type
    numeric_col = pd.to_numeric(df[column], errors='coerce')

    # get the non-numeric values and their counts
    non_numeric_values = df[column][numeric_col.isna()].value_counts()

    # check if there are any non-numeric values
    if non_numeric_values.empty:
        print("No non numeric values in that column.")
    else:
        # create a table with non-numeric values and their counts
        non_numeric_table = pd.DataFrame({'Non-Numeric Value': non_numeric_values.index,
                                          'Count': non_numeric_values.values})

        # display the table
        print(non_numeric_table)

#### 1.2 --- get_mean_median_for_column

In [65]:
def get_mean_median_for_column(df, col_name):
    '''
    This function takes in a pandas dataframe and the name of a column in the dataframe,
    and returns the mean and median of the numeric values in the column that are not equal to 0.
    Non-numeric values are converted to 0 before calculating the mean and median.
    
    Parameters:
    - df: pandas dataframe
    - col_name: str, name of column to be processed
    
    Returns:
    - tuple of two floats: mean and median of numeric values in the column that are not equal to 0
    '''
    # Convert non-numeric values to 0
    df[col_name] = pd.to_numeric(df[col_name], errors='coerce').fillna(0)
    
    # Get the non-zero numeric values in the column
    non_zero_vals = df[col_name][df[col_name] != 0]
    
    # Calculate the mean and median of the non-zero values
    col_mean = non_zero_vals.mean()
    col_median = non_zero_vals.median()
    
    return col_mean, col_median


#### 1.3 --- replace_missing_values

In [66]:
def replace_missing_values(df, col_name, stat='mean'):
    '''
    This function takes in a pandas dataframe and the name of a column in the dataframe.
    It drops all rows where the value of the column is 0, and replaces those values with either 
    the mean or median of the rest of the values in the column, as specified by the user.
    It also replaces any NaN values in the column with the same statistic as the missing values.
    
    Parameters:
    - df: pandas dataframe
    - col_name: str, name of column to be processed
    - stat: str, either 'mean' or 'median', determines which statistic to use
    
    Returns:
    - df: pandas dataframe with modified column
    '''
    # Calculate the selected statistic of the non-zero/non-NaN values in the column
    if stat == 'mean':
        stat_val = np.nanmean(df[df[col_name].notnull() & (df[col_name] != 0)][col_name])
    elif stat == 'median':
        stat_val = np.nanmedian(df[df[col_name].notnull() & (df[col_name] != 0)][col_name])
    else:
        raise ValueError("stat must be either 'mean' or 'median'")
    
    # Replace the missing values (0 or NaN) with the selected statistic
    df.loc[(df[col_name] == 0) | (df[col_name].isnull()), col_name] = stat_val
    
    return df

#### 2.1 --- convert_to_usd

In [67]:
def convert_to_usd(amount):
    amount.replace(' ', '')
    amount.replace('\xa0', '')
    if amount.startswith('$'):
        amount = amount.strip('$').replace(',', '')   # must remove commas
        return float(amount)   # convert str into float
    elif amount.startswith('€'):
        # Exchange rate for EUR to USD
        amount = amount.strip('€').replace(',', '')
        return float(amount) * 1.06 
    elif amount.startswith('¥'):
        # Exchange rate for YEN to USD
        amount = amount.strip('¥').replace(',', '')
        return float(amount) * 0.0075
    elif amount.startswith('₹'):
        # Exchange rate for RPL to USD
        amount = amount.strip('₹').replace(',', '')
        return float(amount) * 0.012 
    elif amount.startswith('SEK'):
        # Exchange rate for SEK to USD
        amount = amount.strip('SEK').replace(',', '')
        return float(amount) * 0.094
    elif amount.startswith('DKK'):
        # Exchange rate for RPL to USD
        amount = amount.strip('DKK').replace(',', '')
        return float(amount) * 0.14
    elif amount.startswith('£'):
        # Exchange rate for RPL to USD
        amount = amount.strip('£').replace(',', '')
        return float(amount) * 1.21  
    else:
        return None

#### 2.2 --- adjust_for_inflation

In [68]:
def adjust_for_inflation(df, column_name, year_column, new_column, drop_original=True):
    data = {
        "1990": 5.398,
        "1991": 4.235,
        "1992": 3.0288,
        "1993": 2.9517,
        "1994": 2.6074,
        "1995": 2.8054,
        "1996": 2.9312,
        "1997": 2.3377,
        "1998": 1.5523,
        "1999": 2.188,
        "2000": 3.3769,
        "2001": 2.8262,
        "2002": 1.586,
        "2003": 2.2701,
        "2004": 2.6772,
        "2005": 3.3927,
        "2006": 3.2259,
        "2007": 2.8527,
        "2008": 3.8391,
        "2009": -0.3555,
        "2010": 1.64,
        "2011": 3.1568,
        "2012": 2.0693,
        "2013": 1.4648,
        "2014": 1.6222,
        "2015": 0.1186,
        "2016": 1.2616,
        "2017": 2.1301,
        "2018": 2.4426,
        "2019": 1.8122,
        "2020": 1.2336,
        "2021": 4.6979
    }
    
    # Create a new column in the DataFrame to store the adjusted values
    df[new_column] = 0

    # Loop over the rows in the DataFrame
    for index, row in df.iterrows():
        # Get the year from the row
        year = row[year_column]
        # Skip the row if the value in the specified column is NaN
        if pd.isna(row[column_name]):
            continue
        # Get the inflation rate for each year from the dictionary
        inflation_rates = [data[str(yr)] for yr in range(year, 2022)]
        # Calculate the total inflation factor by multiplying the inflation rates together
        total_inflation_factor = 1
        for rate in inflation_rates:
            total_inflation_factor *= 1 + (rate / 100)
        # Get the value from the specified column
        value = row[column_name]
        # Adjust the value for inflation using the total inflation factor
        adjusted_value = value * total_inflation_factor
        # Round the result to two decimal places and store it in the new column
        df.at[index, new_column] = round(adjusted_value, 2)

    # Drop the original column if specified
    if drop_original:
        df = df.drop(columns=[column_name])

    # Return the DataFrame with the adjusted values
    return df


#### 3.1 --- one_hot_encoding_column

In [69]:
def one_hot_encoding_column(dataset, column, separator=", ", prefix=""):
    """
    Performs one-hot encoding on the specified column of the given dataset.
    dataset: The dataset to be processed.
    column: The name of the column to be one-hot encoded.
    separator: The separator used in the values of the specified column. Defaults to ",".
    prefix: Optional string to be added in front of each new column name. Defaults to "".
    returns: the new dataset with the specified column one-hot encoded.
    """

    # 1. Creating a list with all the values mentioned in the dataset
    value_list = [values.split(separator) for values in dataset[column]]

    # 2. Creating a set with value categories
    unique_v = {value for values in value_list for value in values}

    # 3. Performing one-hot encoding using get_dummies method
    value_subtable = pd.get_dummies(dataset[column].str.split(separator, expand=True).stack()).reset_index(level=1, drop=True)
    value_subtable = value_subtable.groupby(value_subtable.index).sum()

    # 4. Adding the prefix to the column names
    if prefix:
        value_subtable.columns = [prefix + str(col) for col in value_subtable.columns]

    # 5. Merging the subtable with the main dataset
    dataset_processed = pd.merge(dataset, value_subtable, left_index=True, right_index=True, how='left')
    dataset_processed.drop(columns=[column], inplace=True)

    # 6. Returning the new dataset
    return dataset_processed

#### 3.2 --- one_hot_coding_binary

In [70]:
def one_hot_coding_binary(dataset, original_column, prefix, file_column, file_location, separator=", ", num_categories=1, drop_original=True):
    if num_categories not in range(1,5):
        raise ValueError("num_categories must be between 1 and 4")

    for i in range(1, num_categories+1):
        dataset[f"{prefix}_no_{i}"] = dataset[original_column].str.split(separator, expand=True)[i-1]

    dataset_categories = pd.read_csv(file_location)

    for i in range(1, num_categories+1):
        replace = dataset[f"{prefix}_no_{i}"].isin(dataset_categories[file_column])
        dataset[f"{prefix}_no_{i}_binary"] = replace.astype(int)

    if drop_original:
        dataset.drop(columns=[original_column], inplace=True)

    if num_categories == 1:
        dataset.drop(columns=[f"{prefix}_no_1"], inplace=True)
        dataset.rename(columns={f"{prefix}_no_1_binary": f"{prefix}"}, inplace=True)
    else:
        for i in range(1, num_categories+1):
            dataset.drop(columns=[f"{prefix}_no_{i}"], inplace=True)
            dataset.rename(columns={f"{prefix}_no_{i}_binary": f"{prefix}_{i}"}, inplace=True)

        if num_categories == 5:
            dataset.rename(columns={f"{prefix}_all_binary": f"{prefix}_all"}, inplace=True)

    return dataset


#### 4.1 --- convert_imdb_str_to_int

In [71]:
def get_value(val):
    ''' 
    Converts input string to an integer.
    '''
    # 1. Check if the last character of the string is a k or m and if
    # not it converts the string to an int
    if val[-1].lower() not in ['k', 'K', 'm', 'M']:
        return int(val)
    multiplier = val[-1].lower()
    # 2. If the string ends with K multiply by 1000
    if multiplier == "k" or multiplier == "K":
        value = float(val[:-1]) * 1000
        return value
    # 3. If the string ends with M multiply by 1000000 
    elif multiplier == "m" or multiplier == "M":
        value = float(val[:-1]) * 1000000
        return value

#### 4.2 --- scraping_imdb_for_movie_info

In [72]:
def searching_IMDb_movie(title_column, dataset):
    """
    Updating the dataset by adding two new columns for each movie 
    including metascore and number of ratings. 
    
    Parameters:
    name_column : the column containing the person's name
    dataset : dataset
    person_type : must specify for IMDb if person is 'actor'/'director'
    """
    # 1. Instantiating imdb 
    ia = imdb.IMDb()

    # 2. For each row of the df, extract the nam for the specific column
    for index, row in dataset.iterrows():
        try:
            # Get the movie title
            movie_title = row[title_column]
            
            # Search for a movie on IMDb by title
            movie = ia.search_movie(movie_title)
            movie_id = movie[0].getID()            

            # Make a request to the URL and get the HTML content

            # Request for movie (different webpage)
            movie_req = Request(
                url = f'https://www.imdb.com/title/tt{movie_id}/', 
                headers={'User-Agent': 'Mozilla/5.0'}
            )
            movie_html = urlopen(movie_req).read()
            movie_soup = BeautifulSoup(movie_html, 'html.parser')

            # Request for metascore (different webpage)
            meta_req = Request(
                url = f'https://www.imdb.com/title/tt{movie_id}/criticreviews/?ref_=tt_ov_rt', 
                headers={'User-Agent': 'Mozilla/5.0'}
            )
            meta_html = urlopen(meta_req).read()
            meta_soup = BeautifulSoup(meta_html, 'html.parser')
            
            # If metascore exists try to extract info
            try:
                metascore_line = meta_soup.find('div', {'class': 'sc-79ae5a4-0 kUTYKi'}).text.split()
                metascore = get_value(metascore_line[0].split('M')[0])
            except AttributeError:
                metascore = None

            # If number of votes exists try to extract info
            try:
                num_votes_line = movie_soup.find('div', {'class': 'sc-e457ee34-3 frEfSL'}).text.split()
                num_votes = get_value(num_votes_line[0])
            except AttributeError:
                num_votes = None

        except imdb._exceptions.IMDbError:
            metascore = None
            num_votes = None

        # Make a new column and append the new data
        dataset.loc[index, f'{title_column}_metascore'] = metascore
        dataset.loc[index, f'{title_column}_num_votes'] = num_votes
    
        print(index)
    
    return dataset

#### 4.3 --- scraping_imdb_for_person_info

In [73]:
def searching_IMDb_person(name_column, dataset, person_type):
    """
    Updating the dataset by adding new column for a director/actor
    including; number of movies, commulative average rating for these
    movies, number of won awards, number of nominations. 
    
    Parameters:
    name_column : the column containing the person's name
    dataset : dataset
    person_type : specify if person is actor or director due to different
        search filters applied
    """
    # 1. Instantiating imdb 
    ia = imdb.IMDb()

    # 2. For each row of the df, extract the name for specific column
    for index, row in dataset.iterrows():
        try:
            name = row[name_column]
            # Fetch the IMDb name results based on the person's name
            name_search = ia.search_person(name)
            person_id = name_search[0].getID()
            
            # Awards and Nominations
            # Make a request to the URL and get the HTML content
            awards_req = Request(
                url = f'https://www.imdb.com/name/nm{person_id}/awards/?ref_=nm_awd', 
                headers={'User-Agent': 'Mozilla/5.0'}
            )
            awards_html = urlopen(awards_req).read()
            awards_soup = BeautifulSoup(awards_html, 'html.parser')

            # If awards and nominations exist then get data
            try:
                awards_line = awards_soup.find('div', {'class': 'desc'}).text.split()
                wins = get_value(awards_line[2])
                nominations = get_value(awards_line[5])
            except AttributeError:
                wins = None
                nominations = None

            # Movie history
            # Make a request to the URL and get the HTML
            movies_req = Request(
                url = f'https://www.imdb.com/filmosearch/?explore=title_type&role=nm{person_id}&ref_=filmo_ref_job_typ&sort=release_date,asc&mode=detail&page=1&job_type={person_type}', 
                headers={'User-Agent': 'Mozilla/5.0'}
            )
            movie_html = urlopen(movies_req).read()
            movie_soup = BeautifulSoup(movie_html, 'html.parser')

            # If movie history ecists
            try:
                # Make a list of the webpage and create ratings_list
                movies_list = movie_soup.find_all('div', {'class': 'lister-item mode-detail'})
                ratings_list = []
                for movie in movies_list:
                    # For movie extract the year
                    year_raw = movie.find('span', {'class': 'lister-item-year'}).text.strip('()')
                    year_match = re.match(r'^\d{4}$', year_raw)
                    # Only keep movie rating if movie pre-2023
                    if year_match and int(year_raw) < 2023:
                        rating = movie.find('div', {'class': 'ratings-bar'}).find('div', {'class': 'inline-block ratings-imdb-rating'})
                        if rating is not None:
                            # Append rating to ratings_list
                            ratings_list.append(float(rating['data-value']))
            except AttributeError:
                ratings_list = None

        except imdb._exceptions.IMDbError:
            wins = None
            nominations = None
            ratings_list = None

        # Create new columns and append data to appropriate column
        dataset.loc[index, f'{name_column}_num_wins'] = wins
        dataset.loc[index, f'{name_column}_num_nominations'] = nominations
        dataset.loc[index, f'{name_column}_num_movies'] = len(ratings_list)
        dataset.loc[index, f'{name_column}_avg_rating'] = np.mean(ratings_list)

        print(index)

    return dataset

# Dataprocessing

In [74]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
0,Avatar: The Way of Water,7.8,2022,December,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States
1,Guillermo del Toro's Pinocchio,7.6,2022,December,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France"


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              2000 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               2000 non-null   int64  
 3   Month              2000 non-null   object 
 4   Certificate        1966 non-null   object 
 5   Runtime            2000 non-null   object 
 6   Directors          2000 non-null   object 
 7   Stars              2000 non-null   object 
 8   Genre              2000 non-null   object 
 9   Filming_location   2000 non-null   object 
 10  Budget             2000 non-null   object 
 11  Income             2000 non-null   object 
 12  Country_of_origin  2000 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 203.2+ KB


In [76]:
df.describe()

Unnamed: 0,Rating,Year
count,1999.0,2000.0
mean,6.661631,2012.5
std,0.911507,5.767723
min,1.9,2003.0
25%,6.1,2007.75
50%,6.7,2012.5
75%,7.3,2017.25
max,9.0,2022.0


### 1.Title

#### 1.1Unqiue/ Dupclicate values

In [77]:
num_unique_values = df['Title'].nunique()

In [78]:
print(num_unique_values)

1989


My first instict was to expect 2000 unique values for titles of movies.
So I thought to just remove them from the dataset since I expect them to be duplicate data.
But.

In [79]:
duplicates = df[df.duplicated(['Title'], keep=False)].sort_values(by=['Title'])

duplicates

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin
1610,Black Christmas,4.6,2006,December,R,95,Glen Morgan,"Michelle Trachtenberg, Mary Elizabeth Winstead...",Horror,Canada,"$9,000,000","$21,510,851","Canada, United States"
348,Black Christmas,3.5,2019,December,PG-13,92,Sophia Takal,"Imogen Poots, Aleyse Shannon, Lily Donoghue, B...","Horror, Mystery, Thriller",New Zealand,"$5,000,000","$18,529,730","United States, New Zealand, Canada"
1746,Fantastic Four,5.7,2005,July,PG-13,106,Tim Story,"Ioan Gruffudd, Michael Chiklis, Chris Evans, J...","Action, Adventure, Fantasy",Canada,"$100,000,000","$333,535,934","Germany, United States"
767,Fantastic Four,4.3,2015,August,PG-13,100,Josh Trank,"Miles Teller, Kate Mara, Michael B Jordan, Jam...","Action, Adventure, Sci-Fi",USA,"$120,000,000","$167,882,881","United States, Germany, United Kingdom"
1263,Frozen,6.2,2010,March,R,93,Adam Green,"Shawn Ashmore, Emma Bell, Kevin Zegers, Ed Ack...","Adventure, Drama, Mystery",USA,Unknown,"$3,843,774",United States
903,Frozen,7.4,2013,November,PG,102,"Chris Buck, Jennifer Lee","Kristen Bell, Idina Menzel, Jonathan Groff, Jo...","Animation, Adventure, Comedy",Norge,"$150,000,000","$1,304,550,716",United States
357,Hellboy,5.2,2019,April,R,120,Neil Marshall,"David Harbour, Milla Jovovich, Ian McShane, Sa...","Action, Adventure, Fantasy",Bulgaria,"$50,000,000","$55,065,289","United States, United Kingdom, Bulgaria, Canad..."
1843,Hellboy,6.8,2004,April,PG-13,122,Guillermo del Toro,"Ron Perlman, Doug Jones, Selma Blair, John Hurt","Action, Adventure, Fantasy",Czech Republic,"$66,000,000","$99,378,985",United States
1905,Oldboy,8.4,2003,November,R,120,Park Chan wook,"Choi Min sik, Yoo Ji tae, Kang Hye jeong, Kim ...","Action, Drama, Mystery",South Korea,"$3,000,000","$15,421,226",South Korea
920,Oldboy,5.8,2013,November,R,104,Spike Lee,"Josh Brolin, Elizabeth Olsen, Samuel L Jackson...","Action, Drama, Mystery",USA,"$30,000,000","$5,186,767",United States


Looking at the duplicate movie titles we can quickly see that its totally diffrent movies that only share the title name and nothing else therefore no duplicate data and we can keep it.
I do expect that we drop this column since we cant make in to numeric.
Well, you could keep it and count the length of the title but I would say that shouldnt have an effect on the model and just be in the way.

### 2.Rating

Rating of the movie, the user rates 0-10 and this it the average of that voting with 1 decimal. This is our y, the data that we want to predict. 

#### 2.1 Missing data
According to the info-function, there is one movie that doesn´t have a value. We cant do more than just drop that one from the dataset.



In [80]:
df = df.dropna(subset=['Rating'])


#A quick check to see that the row was removed from the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1999 non-null   object 
 1   Rating             1999 non-null   float64
 2   Year               1999 non-null   int64  
 3   Month              1999 non-null   object 
 4   Certificate        1965 non-null   object 
 5   Runtime            1999 non-null   object 
 6   Directors          1999 non-null   object 
 7   Stars              1999 non-null   object 
 8   Genre              1999 non-null   object 
 9   Filming_location   1999 non-null   object 
 10  Budget             1999 non-null   object 
 11  Income             1999 non-null   object 
 12  Country_of_origin  1999 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.6+ KB


Since this is what our model is gonna predict, I dont want to do more here. The type is float64, which tells us that all values is numeric and can be decimal and since imdb is ratings from 0-10 with decimals this seems correct. One thing to consider would be to 10x all the values in ratings and convert the column to an int since that will be quicker to work with.
Actually wann quick check that all the values is between 0 and 10.

#### 2.2 Check the data is within expected range

In [81]:
# Count how many values are between 0 and 10 (inclusive)
column_values = df['Rating']
count = column_values.between(0, 10).sum()

In [82]:
print(count)

1999


Our orginial dataset consistet of 2000 rows and we dropped one with missing values so 1999 was what we expected and hoped for. 


### 3.Year

#### 3.1 Check the data is within expected range

In [83]:
# Count how many values are between 0 and 10 (inclusive)
column_values = df['Year']
count = column_values.between(2002, 2023).sum()

In [84]:
print(count)

1999


The type of the column is int which make sense aswell.

### 4.Month

#### 4.1 Unqiue/ Dupclicate values



In [85]:
column_values = df['Month'].unique()
print(column_values)

['December' 'August' 'November' 'October' 'March' 'September' 'May'
 'April' 'January' 'July' 'June' 'February' '2014' '2008']


Seeing two values that I didnt expect,  2014 and 2018. Start with checking number of times they appear.

In [86]:
# Count the number of times a specific string value occurs in a column
count = df['Month'].value_counts()['2014']
print(count)

1


In [87]:
# Count the number of times a specific string value occurs in a column
count = df['Month'].value_counts()['2008']
print(count)

1


Since they just occurs one time each in the Month-column I suggest we drop them since 2/ 1999 rows one impact our size of dataset especcially much and not seeing it beeing the worth the time to save

In [88]:
df = df.drop(index=df.loc[df['Month'] == '2014'].index)
df = df.drop(index=df.loc[df['Month'] == '2008'].index)

In [89]:
#checking the rows have been dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   object 
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 218.4+ KB


In [90]:
column_values = df['Month'].unique()
print(column_values)

['December' 'August' 'November' 'October' 'March' 'September' 'May'
 'April' 'January' 'July' 'June' 'February']


#### 4.2 Converting non-numeric to numeric values



In [91]:
# Define a dictionary to map months to integers
month_to_int = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

# Apply the map() method to convert the values
df['Month'] = df['Month'].map(month_to_int)

# Convert the type of the column to int
df['Month'] = df['Month'].astype(int)

In [92]:
#checking the month-values has been rplaced by 1-12 and the column converted to int
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.4+ KB


In [93]:
column_values = df['Month'].unique()
column_values.sort()
print(column_values)

[ 1  2  3  4  5  6  7  8  9 10 11 12]


### 5.Certificate

In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1997 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1997 non-null   object 
 1   Rating             1997 non-null   float64
 2   Year               1997 non-null   int64  
 3   Month              1997 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1997 non-null   object 
 6   Directors          1997 non-null   object 
 7   Stars              1997 non-null   object 
 8   Genre              1997 non-null   object 
 9   Filming_location   1997 non-null   object 
 10  Budget             1997 non-null   object 
 11  Income             1997 non-null   object 
 12  Country_of_origin  1997 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.4+ KB


In [95]:
### 5.1 Unique values

In [96]:
# Get all unique values of a column
unique_values = df['Certificate'].unique()

In [97]:
print(unique_values)

['PG-13' 'PG' 'R' 'TV-14' 'TV-MA' 'TV-PG' 'TV-Y7' 'Not Rated' nan 'NC-17'
 'TV-G' 'Unrated' 'G']


In [98]:
# Get the count of each unique value in the column, including missing values
value_counts = df['Certificate'].value_counts(dropna=False)

# Print the value counts
print(value_counts)

# Calculate the sum of the counts and print the total
total = value_counts.sum()
print(f'Total: {total}')

R            867
PG-13        712
PG           225
Not Rated     61
TV-MA         40
NaN           33
G             22
TV-14         12
TV-PG          9
NC-17          6
Unrated        6
TV-G           3
TV-Y7          1
Name: Certificate, dtype: int64
Total: 1997


A couple of things. Tree columns stand out.
Not Rated     61   --- probably takes a while from a movie is realsed until it gets rated. Make sense that this mostly consists of movies from the past year
NaN           32   --- not sure how to replace
Unrated        6   --- not sure how to replace

At this point Im not sure how to replace the missing values and w/ ca 100 rows w/ diffrent kind of missing values its a bit much to drop them.
My conclusion is to drop the whole column.


### 6.Runtime

#### 6.1 Check non numeric

In [99]:
#Checking non numeric values
check_non_numeric_values(df, "Runtime")

  Non-Numeric Value  Count
0           Unknown      1


In [100]:
df = df.drop(index=df.loc[df['Runtime'] == 'Unknown'].index)

In [101]:
#check that the row has been dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   object 
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 218.3+ KB


In [102]:
#converting the type of the column to int
df['Runtime'] = df['Runtime'].astype(int)

In [103]:
#checking the type has been changed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   int64  
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
dtypes: float64(1), int64(3), object(9)
memory usage: 218.3+ KB


check for outliers, since movies should maybe have a range from 30-250min isch.
So if i found values >30 or over 300 min I can assumme that something is wrong and drop them

In [104]:
# Count how many values are between 0 and 10 (inclusive)
column_values = df['Runtime']
count = column_values.between(30, 300).sum()

In [105]:
print(count)

1996


Seems all remaining rows has a value between 30-300min which is good news.

### 7.Directors

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   int64  
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
dtypes: float64(1), int64(3), object(9)
memory usage: 218.3+ KB


In [107]:
#calling the function
#df = one_hot_coding_binary(df, "Directors", "top_50_director", "Name", "C:/Users/admin1/Documents/GitHub/ds22_project/data/top_50_directors.csv", num_categories=4, drop_original=True)
df = one_hot_coding_binary(df, "Directors", "top_50_director", "Name", "./data/top_50_directors.csv", num_categories=2, drop_original=False)


In [108]:
df.head()

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin,top_50_director_1,top_50_director_2
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",1,0
2,Bullet Train,7.3,2022,8,R,127,David Leitch,"Brad Pitt, Joey King, Aaron Taylor Johnson, Br...","Action, Comedy, Thriller",Japan,"$85,900,000","$239,268,602","Japan, United States",0,0
3,The Banshees of Inisherin,7.8,2022,11,R,114,Martin McDonagh,"Colin Farrell, Brendan Gleeson, Kerry Condon, ...","Comedy, Drama",Ireland,Unknown,"$19,720,823","Ireland, United Kingdom, United States",0,0
4,M3gan,6.4,2022,12,PG-13,102,Gerard Johnstone,"Jenna Davis, Amie Donald, Allison Williams, Vi...","Horror, Sci-Fi, Thriller",New Zealand,"$12,000,000","$171,253,910",United States,0,0


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   int64  
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 249.5+ KB


In [112]:
#checking sum of each row
director_one = df['top_50_director_1'].sum()
director_two = df['top_50_director_2'].sum()
# director_three = df['top_50_director_3'].sum()
# director_four = df['top_50_director_4'].sum()


#df_directors = pd.DataFrame({'director_one': [director_one], 'director_two': [director_two], 'director_three': [director_three], 'director_four': [director_four]})
df_directors = pd.DataFrame({'director_one': [director_one], 'director_two': [director_two]})

df_directors.head()


Unnamed: 0,director_one,director_two
0,190,7


### 8.Stars

In [113]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin,top_50_director_1,top_50_director_2
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,1,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",1,0


In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1996 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1996 non-null   object 
 1   Rating             1996 non-null   float64
 2   Year               1996 non-null   int64  
 3   Month              1996 non-null   int64  
 4   Certificate        1964 non-null   object 
 5   Runtime            1996 non-null   int64  
 6   Directors          1996 non-null   object 
 7   Stars              1996 non-null   object 
 8   Genre              1996 non-null   object 
 9   Filming_location   1996 non-null   object 
 10  Budget             1996 non-null   object 
 11  Income             1996 non-null   object 
 12  Country_of_origin  1996 non-null   object 
 13  top_50_director_1  1996 non-null   int64  
 14  top_50_director_2  1996 non-null   int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 249.5+ KB


In [None]:
# Splitting stars

In [115]:
#calling the function
#df = one_hot_coding_binary(df, "Stars", "top_1000_Stars", "Name", "C:/Users/admin1/Documents/GitHub/ds22_project/data/top_1000_actors.csv", num_categories=4, drop_original=True)
df = one_hot_coding_binary(df, "Directors", "top_1000_stars", "Name", "./data/top_1000_actors.csv", num_categories=2, drop_original=False)


In [116]:
df.head(2)

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,Budget,Income,Country_of_origin,top_50_director_1,top_50_director_2,top_1000_stars_1,top_1000_stars_2
0,Avatar: The Way of Water,7.8,2022,12,PG-13,192,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Action, Adventure, Fantasy",New Zealand,"$350,000,000","$2,267,946,983",United States,1,0,0,0
1,Guillermo del Toro's Pinocchio,7.6,2022,12,PG,117,"Guillermo del Toro, Mark Gustafson","Ewan McGregor, David Bradley, Gregory Mann, Bu...","Animation, Drama, Family",USA,"$35,000,000","$108,967","United States, Mexico, France",1,0,0,0


In [None]:
df.info()

### 9.Genre

In [None]:
df.info()

In [None]:
#one hot encoding the column Genre
df = one_hot_encoding_column(df, "Genre", separator=", ", prefix = "")

In [None]:
#checking new dataset
df.info()
df.head(2)

### 10.Filmning_location

In [None]:
df.info()

In [None]:
df.head(2)

In [None]:
# assume that `df` is your pandas DataFrame object
column_values = df['Filming_location'].value_counts().sort_values(ascending=False)
print(column_values)

In [None]:
#Seeing 75 movies with Unknown filming_location. How can we replace them? and seeing 97 unique filming locations.
#We we´re discussing if movies mostly are beeing done w/ green screen.
#maybe remove the whole column?

#one hot encoding the column Filming_location
#df = one_hot_encoding_column(df, "Filming_location", separator=", ", prefix = "")

### 11.Budget

In [None]:
df.head(4)

In [None]:
#1.1 convert to USD and strip of non numeric characters
df['Budget'] = df['Budget'].apply(convert_to_usd)

In [None]:
#check the change
df.head(4)

In [None]:
df.info()

In [None]:
#1.2 --- calc with inflation
# Call the function to get the inflation-adjusted values for the "value" column
df = adjust_for_inflation(df, "Budget", "Year", "Budget_inf", drop_original=True)

In [None]:
#checking the change
df.head(4)

In [None]:
df.info()

In [None]:
#1.3 check the mean and median of the column /value of 0 is not a part of the calculation
get_mean_median_for_column(df, "Budget_inf")

In [None]:
#1.4 replace missing values w/ mean or median
df = replace_missing_values(df, "Budget_inf", stat='median')

In [None]:
#check the changes

In [None]:
df.head(4)

In [None]:
df.info()

### 12.Income

In [None]:
df.head(4)

In [None]:
#1.1 convert to USD and strip of non numeric characters
df['Income'] = df['Income'].apply(convert_to_usd)

In [None]:
df.head(4)

In [None]:
#1.2 --- calc with inflation
# Call the function to get the inflation-adjusted values for the "value" column
df = adjust_for_inflation(df, "Income", "Year", "Income_inf", drop_original=True)

In [None]:
df.head(4)

In [None]:
#1.3 check the mean and median of the column
get_mean_median_for_column(df, "Income_inf")

In [None]:
df.head(4)

In [None]:
#1.4 replace missing values w/ mean or median
df = replace_missing_values(df, "Income_inf", stat='mean')

### 13.Country_of_origin

In [None]:
df.info()

In [None]:
df.head(2)

In [None]:
#one hot encoding the column Genre
df = one_hot_encoding_column(df, "Country_of_origin", separator=", ", prefix = "")

In [None]:
df.info()

In [None]:
df.head(2)

### 14.Scraping IMDb for more data

In [118]:
# 1. Creating a new column 'director' with the first value of 'Directors'
df['director'] = df['Directors'].fillna('').str.split(',', expand=True)[0]

In [127]:
searching_IMDb_person('director', df, 'director')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105


IndexError: list index out of range

In [129]:
df.iloc[105:106, :]

Unnamed: 0,Title,Rating,Year,Month,Certificate,Runtime,Directors,Stars,Genre,Filming_location,...,top_50_director_2,top_1000_stars_1,top_1000_stars_2,director,lead,supporting,director_num_wins,director_num_nominations,director_num_movies,director_avg_rating
106,Love Hard,6.3,2021,11,TV-MA,104,Hern n Jim nez,"Nina Dobrev, Jimmy O Yang, Darren Barnet, Jame...","Comedy, Romance",Canada,...,0,0,0,Hern n Jim nez,Nina Dobrev,Jimmy O Yang,,,,


In [119]:
# 1. Creating a new column 'lead' with the first value of 'Stars'
df['lead'] = df['Stars'].fillna('').str.split(',', expand=True)[0]

# 2. Creating a new column 'supporting' with the second value of 'Stars'
df['supporting'] = df['Stars'].fillna('').str.split(',', expand=True)[1]

In [None]:
searching_IMDb_person('lead', df, 'actor')

In [None]:
searching_IMDb_person('supporting', df, 'actor')