# Movies Analysis
### Dataset taken from Kaggle

## Imports

In [845]:
import pandas as pd
import numpy as np
import regex as re

In [846]:
def load_data(loc: str) -> pd.DataFrame:
    """
    Read the dataset CSV file into a dataframe.

    :param loc: location of the dataset file
    :return: pandas dataframe loaded with the dataset
    """
    dataset = pd.read_csv(loc)
    return dataset

# Location of the movies CSV (the dataset taken from Kaggle).
# NOTE(review): this is an absolute path on one specific machine, so the notebook
# cannot run elsewhere without editing it — consider a path relative to the notebook.
location = r"C:\Users\Yonat\OneDrive\Desktop\python_projects\movies_analysis\movies.csv"
# Raw dataset; the following cells clean and reshape `data`.
data = load_data(location)
# Preview the first 10 rows.
data.head(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
5,Outer Banks,(2020– ),"\nAction, Crime, Drama",7.6,\nA group of teenagers from the wrong side of ...,"\n \n Stars:\nChase Stokes, \nMa...",25858.0,50.0,
6,The Last Letter from Your Lover,(2021),"\nDrama, Romance",6.8,\nA pair of interwoven stories set in the past...,\n Director:\nAugustine Frizzell\n| \n S...,5283.0,110.0,
7,Dexter,(2006–2013),"\nCrime, Drama, Mystery",8.6,"\nBy day, mild-mannered Dexter is a blood-spat...","\n \n Stars:\nMichael C. Hall, \...",665387.0,53.0,
8,Never Have I Ever,(2020– ),\nComedy,7.9,\nThe complicated life of a modern-day first g...,\n \n Stars:\nMaitreyi Ramakrish...,34530.0,30.0,
9,Virgin River,(2019– ),"\nDrama, Romance",7.4,"\nSeeking a fresh start, nurse practitioner Me...",\n \n Stars:\nAlexandra Breckenr...,27279.0,44.0,


## Re-arranging the dataset

In [847]:
# Removing unnecessary columns
to_drop = ['ONE-LINE']

# Renaming columns
new_names = {'MOVIES': 'TITLE',
             'RunTime': 'DURATION',
             'Gross': 'GROSS'}

# Build the reshaped frame in one chain instead of mutating `data` in place.
# errors="ignore" keeps this cell re-runnable: running it a second time no longer
# raises KeyError for a column that has already been dropped (rename already
# skips names that are not present).
data = data.drop(columns=to_drop, errors="ignore").rename(columns=new_names)
data.head(10)

Unnamed: 0,TITLE,YEAR,GENRE,RATING,STARS,VOTES,DURATION,GROSS
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,\n Director:\nMatthias Schweighöfer\n| \n ...,,,
5,Outer Banks,(2020– ),"\nAction, Crime, Drama",7.6,"\n \n Stars:\nChase Stokes, \nMa...",25858.0,50.0,
6,The Last Letter from Your Lover,(2021),"\nDrama, Romance",6.8,\n Director:\nAugustine Frizzell\n| \n S...,5283.0,110.0,
7,Dexter,(2006–2013),"\nCrime, Drama, Mystery",8.6,"\n \n Stars:\nMichael C. Hall, \...",665387.0,53.0,
8,Never Have I Ever,(2020– ),\nComedy,7.9,\n \n Stars:\nMaitreyi Ramakrish...,34530.0,30.0,
9,Virgin River,(2019– ),"\nDrama, Romance",7.4,\n \n Stars:\nAlexandra Breckenr...,27279.0,44.0,


In [848]:
# Re-arranging director and stars columns
# Only the text before the first "Star:"/"Stars:" marker of STARS is kept; it is
# the part that may carry a "Director:"/"Directors:" section. Splitting once
# (n=1) and taking element 0 works no matter how many markers a row contains, so
# rows with zero or several markers cannot break the column assignment.
director_text = (
    data["STARS"]
    .str.split(r"Stars?:", n=1, regex=True)
    .str[0]
    .fillna("")  # a missing STARS value is treated as "no directors"
    .str.replace(r"[\n|]", "", regex=True)  # strip newlines and the "|" separator
    .str.replace(r"Directors?:", "", regex=True)  # strip the "Director:"/"Directors:" label
)

# separating the directors to a list of trimmed names; blank entries are skipped,
# so a title without directors gets [] rather than a list holding one empty string
data["DIRECTORS"] = director_text.str.split(",").apply(
    lambda names: [name.strip() for name in names if name.strip()]
)

data = (
    data.drop(columns=["STARS"])  # removing stars column
    .drop_duplicates(["TITLE", "YEAR"], keep="first")  # dropping duplicates based on year and title, and keeping the first value
    .sort_values(by="TITLE")  # sorting by title
    .reset_index(drop=True)  # fresh 0..n-1 index without keeping the old one as a column
)

In [849]:
# Genre column
# The cleaned column is assigned back explicitly. Calling an in-place replace on
# the data["GENRE"] selection is chained assignment, which leaves `data`
# untouched under pandas Copy-on-Write and would let NaN reach the list step.
data["GENRE"] = (
    data["GENRE"]
    .fillna("")  # replacing nan values with an empty string
    .str.replace("\n", "", regex=False)  # removing the \n
    .str.split(",")  # splitting the string to a list
    # remove redundant spaces; blank entries are skipped so a row without a
    # genre gets [] rather than a list holding one empty string
    .apply(lambda genres: [genre.strip() for genre in genres if genre.strip()])
)

In [850]:
# marking which row is a movie or a tv-show, based on the year: if it has "–" in it, it's a tv show
# extract the year: take the content of the first parenthesised group; missing
# values and values without parentheses become ''. The result is assigned back
# explicitly, because an in-place replace on the data["YEAR"] selection is
# chained assignment and leaves `data` untouched under pandas Copy-on-Write.
# NOTE(review): only the first "(...)" group is read, so a value shaped like
# "(I) (2018)" would end up with an empty year — confirm whether such values occur.
year_group = data["YEAR"].str.extract(r"\(([^)]*)\)", expand=False).fillna("")
data["YEAR"] = year_group.str.replace(r"[^\d–]", "", regex=True)  # keep only digits and the en dash

# After the clean-up a year is '', all digits, or contains "–", so the three
# labels below cover every row without a per-row Python loop.
is_show = data["YEAR"].str.contains("–", regex=False)
has_year = data["YEAR"].ne("")
data["LABEL"] = np.select([is_show, has_year], ["TV-Show", "Movie"], default="")

In [851]:
# keeping only the rows that received a label: rows whose LABEL is '' (no year
# could be extracted) are dropped; rating, votes and duration are not checked here
data = data.loc[data["LABEL"].ne('')]
data.head(30)

Unnamed: 0,TITLE,YEAR,GENRE,RATING,VOTES,DURATION,GROSS,DIRECTORS,LABEL
0,13 Reasons Why,2017–2020,"[Drama, Mystery, Thriller]",5.7,1798.0,59.0,,[Russell Mulcahy],TV-Show
1,1899,2022–,"[Drama, History, Horror]",,,,,[Baran bo Odar],TV-Show
2,3Below: Tales of Arcadia,2018–2019,"[Animation, Action, Adventure]",7.9,143.0,22.0,,[Andrew L. Schmidt],TV-Show
3,50M2,2021–,"[Comedy, Drama, Thriller]",6.9,148.0,51.0,,[Selçuk Aydemir],TV-Show
4,7Seeds,2019–2020,"[Animation, Action, Adventure]",6.9,48.0,24.0,,[],TV-Show
5,800 metros,2021–,[Documentary],,,,,[León Siminiani],TV-Show
6,A Suitable Boy,2020–,"[Drama, Romance]",6.1,156.0,58.0,,[Mira Nair],TV-Show
7,"Ada Twist, Scientist",2021–,"[Animation, Adventure, Comedy]",,,,,[],TV-Show
8,After Life,2019–,"[Comedy, Drama]",7.9,1942.0,27.0,,[Ricky Gervais],TV-Show
9,Age of Samurai: Battle for Japan,2021–,"[Documentary, Action, Adventure]",7.4,215.0,43.0,,[Stephen Scott],TV-Show


## Dividing into TV shows and movies dataframes

In [869]:
# getting all the tv shows into a single df
TVs = data.loc[data["LABEL"] == "TV-Show"].reset_index(drop=True)

# Split the "start–end" year once into two columns. n=1 caps the result at two
# parts even if a value holds several dashes, and reindex guarantees that both
# columns exist when there are no TV-show rows at all.
year_span = (
    TVs.pop("YEAR")
    .str.split('–', n=1, expand=True)
    .reindex(columns=[0, 1])
)
TVs.insert(1, 'RELEASE', year_span[0])
TVs.insert(2, 'END', year_span[1].replace('', 'Running'))  # an empty end year is recorded as 'Running'
TVs.head()

Unnamed: 0,TITLE,RELEASE,END,GENRE,RATING,VOTES,DURATION,GROSS,DIRECTORS,LABEL
0,13 Reasons Why,2017,2020,"[Drama, Mystery, Thriller]",5.7,1798.0,59.0,,[Russell Mulcahy],TV-Show
1,1899,2022,Running,"[Drama, History, Horror]",,,,,[Baran bo Odar],TV-Show
2,3Below: Tales of Arcadia,2018,2019,"[Animation, Action, Adventure]",7.9,143.0,22.0,,[Andrew L. Schmidt],TV-Show
3,50M2,2021,Running,"[Comedy, Drama, Thriller]",6.9,148.0,51.0,,[Selçuk Aydemir],TV-Show
4,7Seeds,2019,2020,"[Animation, Action, Adventure]",6.9,48.0,24.0,,[],TV-Show


In [853]:
# getting all the movies into a single df, re-indexed from 0
Movies = data[data["LABEL"].eq("Movie")].reset_index(drop=True)
Movies.head()

Unnamed: 0,TITLE,YEAR,GENRE,RATING,VOTES,DURATION,GROSS,DIRECTORS,LABEL
0,AlRawabi School for Girls,2021,[Drama],,,,,[Tima Shomali],Movie
1,Alguien tiene que morir,2020,"[Crime, Drama, Thriller]",6.8,204.0,49.0,,[Manolo Caro],Movie
2,Astronomy Club,2019,[Comedy],6.4,38.0,,,"[J.J. Adler, Ryan Anthony Martin]",Movie
3,Astérix,2023,"[Animation, Action, Adventure]",,,,,[],Movie
4,Behind Her Eyes,2021,"[Drama, Mystery, Thriller]",7.0,1129.0,50.0,,[Erik Richter Strand],Movie


## Dealing with blanks

#### getting a grasp: https://towardsdatascience.com/data-cleaning-with-python-and-pandas-detecting-missing-values-3e9c6ebcf78b
#### methods of dealing with missing data: https://github.com/matthewbrems/ODSC-missing-data-may-18/blob/master/Analysis%20with%20Missing%20Data.pdf

In [854]:
## dealing with the blanks
