# Movies Analysis
### Dataset taken from Kaggle

## Imports

In [117]:
import pandas as pd
import numpy as np

In [118]:
def load_data(loc: str) -> pd.DataFrame:
    """
    :param loc: location of the dataset file
    :return: pandas dataframe loaded with the dataset
    """
    return pd.read_csv(loc)

location = r"C:\Users\Yonat\OneDrive\Desktop\python_projects\movies_analysis\movies.csv"
data = load_data(location)
data.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


## Changing Columns

In [119]:
# Removing unnecessary columns
to_drop = ['ONE-LINE']
data.drop(columns = to_drop, axis=1, inplace=True)

# Renaming columns
new_names = {'MOVIES': 'TITLE',
             'RunTime': 'DURATION',
             'Gross': 'GROSS'}
data.rename(columns = new_names, inplace= True)

data.head()

Unnamed: 0,TITLE,YEAR,GENRE,RATING,STARS,VOTES,DURATION,GROSS
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,\n Director:\nMatthias Schweighöfer\n| \n ...,,,


## Filling blanks
#### getting a grasp: https://towardsdatascience.com/data-cleaning-with-python-and-pandas-detecting-missing-values-3e9c6ebcf78b
#### methods of dealing with missing data: https://github.com/matthewbrems/ODSC-missing-data-may-18/blob/master/Analysis%20with%20Missing%20Data.pdf

In [122]:
# getting a grasp of the data
cols = ['Column Name', 'Blanks', 'Datatype']
side_df = pd.DataFrame(columns=cols)
side_df['Column Name'] = data.columns

for col in data.columns:
    side_df.loc[side_df['Column Name'] == col, 'Datatype'] = data[col].dtype

    blank_count = data[col].isnull().sum() # Counting null values
    blank_count += data[col].isna().sum()  # Counting nan values
    additional_blanks = data[col].isin(['', ' ', '-', '--']).sum() # Additional conditions for other representations of blanks ('-', '--', etc.)
    blank_count += additional_blanks

    side_df.loc[side_df['Column Name'] == col, 'Blanks'] = blank_count

In [123]:
side_df

Unnamed: 0,Column Name,Blanks,Datatype
0,TITLE,0,object
1,YEAR,1288,object
2,GENRE,160,object
3,RATING,3640,float64
4,STARS,0,object
5,VOTES,3640,object
6,DURATION,5916,float64
7,GROSS,19078,object


In [124]:
# Removing blanks
print("Hello world")

Hello world


## Dividing to TV shows and movies dataframes