In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# 1 Data Reading : ( excel / csv / json )
def read_data():
    file_path = input("Please enter File Path :")
    file_format = input("Please enter File Format (csv / excel / json) :").lower()
    if file_format == 'csv':
        return pd.read_csv(file_path)
    elif file_format == 'excel':
        return pd.read_excel(file_path)
    elif file_format == 'json':
        return pd.read_json(file_path)
    else:
        raise ValueError("File not available, please choose from 'csv', 'excel', or 'json'.")

In [7]:
data = read_data()

Please enter File Path : movies.csv
Please enter File Format (csv / excel / json) : csv


In [8]:
data

Unnamed: 0,Series_Title,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,No_of_Votes,Gross,Released_date,Langunges,budget,revenue,country
0,Her,A,126 min,"Drama, Romance, Sci-Fi",6.0,90,Spike Jonze,540772,"$25,568,251",1/16/2014,English,"$23,000,000.00",48043998,AU
1,300,A,117 min,"Action, Drama",7.6,52,Zack Snyder,732876,"$210,614,939",4/5/2007,English,60000000,454161935,AU
2,21 Grams,UA,124 min,"Crime, Drama, Thriller",7.6,70,Alejandro G. Iñárritu,224545,"$16,290,476",1/22/2004,English,20000000,59667625,AU
3,25Th Hour,R,135 min,Drama,7.6,68,Spike Lee,169708,"$13,060,843",6/5/2003,English,4500000,"$25,344,490.00",AU
4,50/50,R,100 min,"Comedy, Drama, Romance",7.6,72,Jonathan Levine,315426,"$35,014,192",3/8/2012,English,8000000,41334735,AU
5,A Hard Day'S Night,U,87 min,"Comedy, Music, Musical",7.6,96,Richard Lester,40351,"$13,780,024",7/7/1964,English,560000,1626784,GB
6,A Star Is Born,UA,136 min,"Drama, Music, Romance",7.6,88,Bradley Cooper,334312,"$215,288,866",10/18/2018,English,36000000,433888866,AU
7,American Psycho,A,101 min,"Comedy, Crime, Drama",7.6,64,Mary Harron,490062,"$15,070,285",4/13/2000,English,8000000,34270285,AU
8,Apollo 13,U,140 min,"Adventure, Drama, History",7.6,77,Ron Howard,269197,"$173,837,933",8/31/1995,English,65000000,335802271,AU
9,Baby Driver,UA,113 min,"Action, Crime, Drama",7.6,86,Edgar Wright,439406,"$107,825,862",8/10/2017,English,34000000,226977991,AU


In [9]:
# 2 Data Summary
def data_summary(data, target_column):
    #print information
    print("Information about all the columns and their situations and how many rows:\n")
    print(data.info())
    print("______________________________________________\n")
    #value & frequencies
    print("Values and their frequencies in the target column:\n")
    print(data[target_column].value_counts())
    print("______________________________________________\n")
    #Summary coloumn
    print("Summary statistics for the target column:\n")
    print(data[target_column].describe())
    print("______________________________________________\n")
    #repeated value
    print("The most repeated value in the target column:")
    mode_value = data[target_column].mode()
    if not mode_value.empty:
        print(mode_value.values[0])
    else:
        print("No mode exists.")
    print("______________________________________________\n")
    #Mean
    print("The mean of the target column:\n")
    print("Using Pandas:")
    print(data[target_column].mean())
    print("Using NumPy:")
    print(np.mean(data[target_column]))
    print("______________________________________________\n")
    #Max
    print("The highest value of the target column:")
    print(np.max(data[target_column]))
    print("______________________________________________\n")
    #Min
    print("The smallest value of the target column:")
    print(np.min(data[target_column]))
    print("______________________________________________\n")

In [10]:
data_summary(data, "Series_Title")

Information about all the columns and their situations and how many rows:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series_Title   23 non-null     object 
 1   Certificate    23 non-null     object 
 2   Runtime        23 non-null     object 
 3   Genre          23 non-null     object 
 4   IMDB_Rating    23 non-null     float64
 5   Meta_score     23 non-null     int64  
 6   Director       23 non-null     object 
 7   No_of_Votes    23 non-null     int64  
 8   Gross          23 non-null     object 
 9   Released_date  23 non-null     object 
 10  Langunges      23 non-null     object 
 11  budget         23 non-null     object 
 12  revenue        23 non-null     object 
 13  country        23 non-null     object 
dtypes: float64(1), int64(2), object(11)
memory usage: 2.6+ KB
None
______________________________________________

Va

TypeError: Could not convert string 'Her30021 Grams25Th Hour50/50A Hard Day'S NightA Star Is BornAmerican PsychoApollo 13Baby DriverBlood SimpleBlowBreakfast At Tiffany'SBridge Of SpiesClose Encounters Of The Third KindCreedDark CityDark WatersDawn Of The Planet Of The ApesDazed And ConfusedDelicatessenDespicable MeDie Hard: With A Vengeance' to numeric

In [15]:
# 3 Data handling missing value
def clear_missing(data):
    print("First, the rows with missing values:\n")
    empty_rows = data[data.isnull().any(axis=1)]
    print(empty_rows)
    x = input("Do you want to delete or fill the empty cells? (D/F): ").upper()

    if x == "D":
        data = data.dropna()
        return data
    elif x == "F":
        fill_value = input("Enter the value to fill the empty cells: ")
        data = data.fillna(fill_value)
        return data
    else:
        print("Invalid input. Please enter ('D' - delete) or ('F' - fill) missing values.")
        return None

In [16]:
#creating random "empty" values in the table
data["Empty"] = np.random.randint(0,2,len(data))
data["Empty"] = np.where(data["Empty"] == 0,None,1)

In [18]:
data["Empty"].value_counts()

Empty
1    11
Name: count, dtype: int64

In [19]:
#succedded to fill the place with number 1
clear_missing(data)

First, the rows with missing values:

          Series_Title Certificate  Runtime                      Genre  \
1                  300           A  117 min              Action, Drama   
3            25Th Hour           R  135 min                      Drama   
5   A Hard Day'S Night           U   87 min     Comedy, Music, Musical   
6       A Star Is Born          UA  136 min      Drama, Music, Romance   
7      American Psycho           A  101 min       Comedy, Crime, Drama   
9          Baby Driver          UA  113 min       Action, Crime, Drama   
11                Blow           R  124 min    Biography, Crime, Drama   
15               Creed           A  133 min               Drama, Sport   
16           Dark City           A  100 min  Mystery, Sci-Fi, Thriller   
19  Dazed And Confused           U  102 min                     Comedy   
20        Delicatessen           R   99 min              Comedy, Crime   
21       Despicable Me           U   95 min   Animation, Comedy, Crime   


Do you want to delete or fill the empty cells? (D/F):  F
Enter the value to fill the empty cells:  1


Unnamed: 0,Series_Title,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,No_of_Votes,Gross,Released_date,Langunges,budget,revenue,country,Empty
0,Her,A,126 min,"Drama, Romance, Sci-Fi",6.0,90,Spike Jonze,540772,"$25,568,251",1/16/2014,English,"$23,000,000.00",48043998,AU,1
1,300,A,117 min,"Action, Drama",7.6,52,Zack Snyder,732876,"$210,614,939",4/5/2007,English,60000000,454161935,AU,1
2,21 Grams,UA,124 min,"Crime, Drama, Thriller",7.6,70,Alejandro G. Iñárritu,224545,"$16,290,476",1/22/2004,English,20000000,59667625,AU,1
3,25Th Hour,R,135 min,Drama,7.6,68,Spike Lee,169708,"$13,060,843",6/5/2003,English,4500000,"$25,344,490.00",AU,1
4,50/50,R,100 min,"Comedy, Drama, Romance",7.6,72,Jonathan Levine,315426,"$35,014,192",3/8/2012,English,8000000,41334735,AU,1
5,A Hard Day'S Night,U,87 min,"Comedy, Music, Musical",7.6,96,Richard Lester,40351,"$13,780,024",7/7/1964,English,560000,1626784,GB,1
6,A Star Is Born,UA,136 min,"Drama, Music, Romance",7.6,88,Bradley Cooper,334312,"$215,288,866",10/18/2018,English,36000000,433888866,AU,1
7,American Psycho,A,101 min,"Comedy, Crime, Drama",7.6,64,Mary Harron,490062,"$15,070,285",4/13/2000,English,8000000,34270285,AU,1
8,Apollo 13,U,140 min,"Adventure, Drama, History",7.6,77,Ron Howard,269197,"$173,837,933",8/31/1995,English,65000000,335802271,AU,1
9,Baby Driver,UA,113 min,"Action, Crime, Drama",7.6,86,Edgar Wright,439406,"$107,825,862",8/10/2017,English,34000000,226977991,AU,1


In [20]:
#delete all the empty rows
clear_missing(data)

First, the rows with missing values:

          Series_Title Certificate  Runtime                      Genre  \
1                  300           A  117 min              Action, Drama   
3            25Th Hour           R  135 min                      Drama   
5   A Hard Day'S Night           U   87 min     Comedy, Music, Musical   
6       A Star Is Born          UA  136 min      Drama, Music, Romance   
7      American Psycho           A  101 min       Comedy, Crime, Drama   
9          Baby Driver          UA  113 min       Action, Crime, Drama   
11                Blow           R  124 min    Biography, Crime, Drama   
15               Creed           A  133 min               Drama, Sport   
16           Dark City           A  100 min  Mystery, Sci-Fi, Thriller   
19  Dazed And Confused           U  102 min                     Comedy   
20        Delicatessen           R   99 min              Comedy, Crime   
21       Despicable Me           U   95 min   Animation, Comedy, Crime   


Do you want to delete or fill the empty cells? (D/F):  D


Unnamed: 0,Series_Title,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,No_of_Votes,Gross,Released_date,Langunges,budget,revenue,country,Empty
0,Her,A,126 min,"Drama, Romance, Sci-Fi",6.0,90,Spike Jonze,540772,"$25,568,251",1/16/2014,English,"$23,000,000.00",48043998,AU,1
2,21 Grams,UA,124 min,"Crime, Drama, Thriller",7.6,70,Alejandro G. Iñárritu,224545,"$16,290,476",1/22/2004,English,20000000,59667625,AU,1
4,50/50,R,100 min,"Comedy, Drama, Romance",7.6,72,Jonathan Levine,315426,"$35,014,192",3/8/2012,English,8000000,41334735,AU,1
8,Apollo 13,U,140 min,"Adventure, Drama, History",7.6,77,Ron Howard,269197,"$173,837,933",8/31/1995,English,65000000,335802271,AU,1
10,Blood Simple,A,99 min,"Crime, Drama, Thriller",7.6,82,Joel Coen,87745,"$2,150,000",6/27/1985,English,1500000,2730877,AU,1
12,Breakfast At Tiffany'S,A,115 min,"Comedy, Drama, Romance",7.6,76,Blake Edwards,166544,"$90,485,605",2/8/1962,English,2500000,9500000,AU,1
13,Bridge Of Spies,UA,142 min,"Drama, History, Thriller",7.6,81,Steven Spielberg,287659,"$72,313,754",10/22/2015,English,40000000,162498338,AU,1
14,Close Encounters Of The Third Kind,U,138 min,"Drama, Sci-Fi",7.6,90,Steven Spielberg,184966,"$132,088,635",1/27/1978,English,"$20,000,000.00",340800479,AU,1
17,Dark Waters,PG-13,126 min,"Biography, Drama, History",7.6,73,Todd Haynes,60408,"$90,485,605",3/5/2020,English,835001.4,13601384,AU,1
18,Dawn Of The Planet Of The Apes,UA,130 min,"Action, Adventure, Drama",7.6,79,Matt Reeves,411599,"$208,545,589",7/9/2014,English,170000000,710644566,AU,1


In [21]:
# 4 Categorical Data Encoding
def describe_categorical_data(data, category_column):
    return data.groupby(category_column).describe()

def encode_categorical_data(data):
    return pd.get_dummies(data)

In [22]:
#last step ( encoding )
encode_categorical_data(data)

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes,Series_Title_21 Grams,Series_Title_25Th Hour,Series_Title_300,Series_Title_50/50,Series_Title_A Hard Day'S Night,Series_Title_A Star Is Born,Series_Title_American Psycho,...,revenue_543464573,revenue_59667625,revenue_710644566,revenue_7961889,revenue_83282296,revenue_9500000,country_AU,country_FR,country_GB,Empty_1
0,6.0,90,540772,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,7.6,52,732876,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,7.6,70,224545,True,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,True
3,7.6,68,169708,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,7.6,72,315426,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
5,7.6,96,40351,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
6,7.6,88,334312,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
7,7.6,64,490062,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
8,7.6,77,269197,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
9,7.6,86,439406,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
