In [1]:
# Import the numpy library for numerical operations
import numpy as np
# Import the pandas library for data manipulation and analysis
import pandas as pd

In [2]:
# Load the Reddit comments CSV straight from its GitHub raw URL into a DataFrame
df = pd.read_csv(
    'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'
)
# Show the (rows, columns) shape followed by a preview of the first five rows.
# A single print with a newline separator emits the same text as two separate prints.
print(df.shape, df.head(), sep='\n')

In [3]:
# Summarise the DataFrame: index dtype, column dtypes, non-null counts and memory usage.
# DataFrame.info() writes the summary to stdout itself and returns None, so wrapping it
# in print() would append a stray "None" line after the summary.
df.info()

In [8]:
# Peek at three randomly chosen 'clean_comment' values, returned as a NumPy array.
# A fixed random_state makes the same three rows come back on every re-run,
# so the displayed examples stay reproducible.
df.sample(3, random_state=42)['clean_comment'].values

In [15]:
# Per-column tally of missing entries (isna is the canonical spelling of isnull)
df.isna().sum()

In [16]:
# Show the rows whose 'clean_comment' is missing (NaN).
# isna() already yields a boolean mask, so no comparison against True is needed.
df[df['clean_comment'].isna()]

In [19]:
# Among the rows lacking a 'clean_comment', tally how often each 'category' value occurs.
# A single .loc call selects the rows and the column in one step.
df.loc[df['clean_comment'].isna(), 'category'].value_counts()

In [20]:
# Show every row whose 'category' is missing (NaN)
df.loc[df['category'].isna()]

In [23]:
# Discard every row that has a missing value in any column.
# The result is rebound to df rather than mutating the loaded frame with inplace=True,
# which keeps the step chainable and makes the data flow explicit.
df = df.dropna()

In [24]:
# Check the (rows, columns) shape of the DataFrame after dropping null values;
# compare it with the shape printed right after loading to see how many rows were removed
df.shape

In [26]:
# Number of rows that repeat an earlier row exactly.
# keep='first' (the default, spelled out here) leaves the first occurrence unflagged.
df.duplicated(keep='first').sum()

In [27]:
# Show the rows flagged as duplicates of an earlier row
df.loc[df.duplicated()]

In [28]:
# Restrict to the rows flagged as duplicates, then tally each distinct row among them.
# Only the flagged repeats are counted here, not the first occurrence of each row.
df.loc[df.duplicated()].value_counts()

In [32]:
# Tally how many times each distinct (full) row occurs in the DataFrame
counts = df.value_counts()
# Retain only the rows that occur at least twice, then print them with their totals
duplicates_with_counts = counts.loc[counts > 1]
print(duplicates_with_counts)

In [33]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is rebound to df rather than mutating the frame with inplace=True,
# which keeps the step chainable and makes the data flow explicit.
df = df.drop_duplicates()

In [34]:
# Verify that there are no more duplicate rows: after the de-duplication step above
# this count is expected to be 0
df.duplicated().sum()

In [36]:
# Show the rows whose 'clean_comment' is blank, i.e. empty once surrounding whitespace is stripped
df.loc[df['clean_comment'].str.strip() == '']

In [38]:
# Keep only the rows whose 'clean_comment' is NOT blank (empty after stripping whitespace).
# The explicit .copy() makes df an independent frame rather than a slice of the previous
# one, so later column assignments on df do not risk pandas' SettingWithCopyWarning.
df = df[~(df['clean_comment'].str.strip() == '')].copy()

In [None]:
# Regular expression module (not used directly in this cell; retained in case
# later cells rely on it being imported here)
import re

# Broad, case-insensitive pattern for URLs (http, https, ftp, www-prefixed, bare domain + '/').
# Every group is non-capturing "(?:...)": Series.str.contains only needs a yes/no match, and
# pandas emits a "has match groups" UserWarning when the pattern contains capturing groups.
# The text matched is the same as with the capturing version of the pattern.
url_pattern = r'(?i)\b(?:(?:https?://|ftp://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

# Select the rows whose 'clean_comment' contains at least one URL; na=False treats
# missing values as "no match" instead of propagating NaN into the boolean mask
rows_with_urls = df[df['clean_comment'].str.contains(url_pattern, regex=True, na=False)]

# Report how many rows contain a URL
print(f"Number of rows with URLs: {len(rows_with_urls)}")
# Display the rows containing URLs
print(rows_with_urls)

In [None]:
# Swap each newline character in 'clean_comment' for a single space so that words on
# adjacent lines do not fuse together. A literal (non-regex) replacement of '\n' targets
# exactly the same character that the regex r'\n' matches.
df['clean_comment'] = df['clean_comment'].str.replace('\n', ' ', regex=False)