# Data Loading

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder


In [3]:
df = pd.read_csv("data.csv")
df

Unnamed: 0,rank,Video,Video views,Likes,Dislikes,Category,published
0,1,"20 Tennis shots if they were not filmed, NOBOD...",3471237,19023,859,,2017
1,2,Lil Nas X - Old Town Road (Official Movie) ft....,54071677,3497955,78799,Music,2019
2,3,JoJo Siwa - Karma (Official Video),34206747,293563,,Music,2024
3,4,Wiz Khalifa - See You Again ft. Charlie Puth [...,6643904918,44861602,,Music,2015
4,5,伊賀の天然水強炭酸水「家族で、シュワシェア。」篇　15秒,236085971,38,,,2021
...,...,...,...,...,...,...,...
995,996,CALIFORNIA - MONDAY JUSTICE FEAT. SNOOP DOGG (...,943248,2949,22,Music,2017
996,997,John McEnroe's most famous outburst happened i...,4853621,7254,426,Sports,2010
997,998,Adi Golan's Group - Teaser,56578,,,Music,2014
998,999,Keg tapping goes horribly wrong.,751804,3427,313,Entertainment,2017


# Data Exploring

In [4]:
print(f"Number of Rows: {df.shape[0]}\nNumber of Columns: {df.shape[1]}")

Number of Rows: 1000
Number of Columns: 7


In [5]:
df.head(2)

Unnamed: 0,rank,Video,Video views,Likes,Dislikes,Category,published
0,1,"20 Tennis shots if they were not filmed, NOBOD...",3471237,19023,859,,2017
1,2,Lil Nas X - Old Town Road (Official Movie) ft....,54071677,3497955,78799,Music,2019


In [6]:
df.tail(2)

Unnamed: 0,rank,Video,Video views,Likes,Dislikes,Category,published
998,999,Keg tapping goes horribly wrong.,751804,3427,313,Entertainment,2017
999,1000,LIVE Hurricane Irma - Downtown Orlando - Ameri...,554438,184,8,News & Politics,2016


In [7]:
df.describe()

Unnamed: 0,rank,published
count,1000.0,1000.0
mean,500.5,2015.933
std,288.819436,6.054847
min,1.0,2005.0
25%,250.75,2010.0
50%,500.5,2017.0
75%,750.25,2021.0
max,1000.0,2025.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Video        1000 non-null   object
 2   Video views  1000 non-null   object
 3   Likes        973 non-null    object
 4   Dislikes     687 non-null    object
 5   Category     820 non-null    object
 6   published    1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [9]:
df.columns

Index(['rank', 'Video', 'Video views', 'Likes', 'Dislikes', 'Category',
       'published'],
      dtype='object')

In [10]:
for i in df.columns:
    print(i)

rank
Video
Video views
Likes
Dislikes
Category
published


In [11]:
df.count()

rank           1000
Video          1000
Video views    1000
Likes           973
Dislikes        687
Category        820
published      1000
dtype: int64

In [12]:
df.nunique()

rank           1000
Video           994
Video views    1000
Likes           843
Dislikes        489
Category         15
published        21
dtype: int64

In [13]:
df['Category'].unique()

array([nan, 'Music', 'Entertainment', 'Sports', 'Comedy',
       'People & Blogs', 'Gaming', 'Pets & Animals', 'Autos & Vehicles',
       'Education', 'Film & Animation', 'News & Politics',
       'Howto & Style', 'Travel & Events', 'Nonprofits & Activism',
       'Science & Technology'], dtype=object)

In [14]:
df.isnull().sum()

rank             0
Video            0
Video views      0
Likes           27
Dislikes       313
Category       180
published        0
dtype: int64

# Data Pre-Prossing

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Video        1000 non-null   object
 2   Video views  1000 non-null   object
 3   Likes        973 non-null    object
 4   Dislikes     687 non-null    object
 5   Category     820 non-null    object
 6   published    1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [16]:
df.isnull().sum()

rank             0
Video            0
Video views      0
Likes           27
Dislikes       313
Category       180
published        0
dtype: int64

### Dealing with Null Values

In [19]:
def fillNaMean(cols):
    for i in cols:
        df[i] = df[i].fillna(df[i].mean())


In [20]:
def fillNaMode(cols):
    for i in cols:
        df[i] = df[i].fillna(df[i].mode()[0])

columns = [ 'Likes', 'Dislikes', 'Category']
fillNaMode(columns)

In [21]:
df.isnull().sum()

rank           0
Video          0
Video views    0
Likes          0
Dislikes       0
Category       0
published      0
dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Video        1000 non-null   object
 2   Video views  1000 non-null   object
 3   Likes        1000 non-null   object
 4   Dislikes     1000 non-null   object
 5   Category     1000 non-null   object
 6   published    1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


#  Dropping Irrelevant Columns

In [None]:
df.dropna(subset=['Category'], inplace=True) 

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Video        1000 non-null   object
 2   Video views  1000 non-null   object
 3   Likes        1000 non-null   object
 4   Dislikes     1000 non-null   object
 5   Category     1000 non-null   object
 6   published    1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [26]:
def changeFloattoInt64(cols):
    for i in cols:
        # Remove commas and convert to integers
        df[i] = df[i].str.replace(',', '').astype('int64')

columns = ['Likes', 'Dislikes']
changeFloattoInt64(columns)

In [29]:
df['published'].unique()

array([2017, 2019, 2024, 2015, 2021, 2023, 2018, 2007, 2020, 2011, 2012,
       2022, 2008, 2025, 2006, 2014, 2009, 2010, 2016, 2013, 2005])

In [30]:
df_clean = df.copy()

In [32]:
from sklearn.preprocessing import LabelEncoder

def encodeCols(cols):
    for i in cols:
        temp = pd.DataFrame({i:df[i].unique()})
        data_LE = LabelEncoder()
        data_LE.fit(np.ravel(temp))
        # data_LE.fit(temp)             # What is the difference between using "np.ravel" or without using "np.ravel"
        df_clean[i] = data_LE.transform(df[i])

columns = ['Category', 'published']	
encodeCols(columns)

In [33]:
df_clean.head()

Unnamed: 0,rank,Video,Video views,Likes,Dislikes,Category,published
0,1,"20 Tennis shots if they were not filmed, NOBOD...",3471237,19023,859,7,12
1,2,Lil Nas X - Old Town Road (Official Movie) ft....,54071677,3497955,78799,7,14
2,3,JoJo Siwa - Karma (Official Video),34206747,293563,0,7,19
3,4,Wiz Khalifa - See You Again ft. Charlie Puth [...,6643904918,44861602,0,7,10
4,5,伊賀の天然水強炭酸水「家族で、シュワシェア。」篇　15秒,236085971,38,0,7,16


In [34]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rank         1000 non-null   int64 
 1   Video        1000 non-null   object
 2   Video views  1000 non-null   object
 3   Likes        1000 non-null   int64 
 4   Dislikes     1000 non-null   int64 
 5   Category     1000 non-null   int64 
 6   published    1000 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 54.8+ KB


In [35]:
df_clean.to_csv(r'cleaned-data.csv', index=False, header=True)

In [36]:
import numpy as np
df2 = pd.DataFrame({
    'A': [1,2,3,'?'],
    'B': [2,3,'?',4]
})

df2.replace("?", np.nan, inplace=True)

  df2.replace("?", np.nan, inplace=True)


In [37]:
print(df2)

     A    B
0  1.0  2.0
1  2.0  3.0
2  3.0  NaN
3  NaN  4.0
