Data Preprocessing

In [None]:
# Preprocess `facebook_data` in place. This cell expects `pd` and
# `facebook_data` to exist already (they are created by the data-loading cell).

# Parse status_published into datetimes using the month/day/year hour:minute layout.
facebook_data['status_published'] = pd.to_datetime(
    facebook_data['status_published'], format='%m/%d/%Y %H:%M'
)

# Drop unnecessary columns. errors='ignore' keeps this step re-runnable:
# without it, running the cell a second time raises KeyError because the
# columns have already been removed from the frame.
facebook_data.drop(
    columns=['Column1', 'Column2', 'Column3', 'Column4'],
    inplace=True,
    errors='ignore',
)

# Handle missing values (if any) by removing every row that contains one.
# This runs after the column drop, so only the remaining columns are considered.
facebook_data.dropna(inplace=True)

# Verify the preprocessing steps. info() prints its summary and returns None,
# so it is called as its own statement; the preview is the cell's last
# expression so the notebook renders it as a table rather than inside a tuple.
facebook_data.info()
facebook_data.head()


Analyzing Effect of Time on Reactions

In [None]:
import matplotlib.pyplot as plt

# Hour of the day taken from each post's publication timestamp.
facebook_data['hour_published'] = facebook_data['status_published'].dt.hour

# Mean reaction count for every publishing hour present in the data.
hourly_reactions = (
    facebook_data['num_reactions']
    .groupby(facebook_data['hour_published'])
    .mean()
)

# Bar chart of the hourly means, with horizontal tick labels for readability.
plt.figure(figsize=(12, 6))
hourly_reactions.plot.bar()
plt.xlabel('Hour of the Day')
plt.ylabel('Average Number of Reactions')
plt.title('Average Number of Reactions by Hour of the Day')
plt.xticks(rotation=0)
plt.show()


Correlation Analysis

In [None]:
# Pairwise correlation between the three engagement counts
# (reactions, comments, shares).
correlation_matrix = facebook_data.loc[
    :, ['num_reactions', 'num_comments', 'num_shares']
].corr()

# Last expression of the cell, so the notebook renders the matrix.
correlation_matrix


K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Build the feature matrix for clustering: the post type plus every
# engagement count column.
clustering_data = facebook_data[['status_type', 'num_reactions', 'num_comments', 'num_shares',
                                 'num_likes', 'num_loves', 'num_wows', 'num_hahas', 'num_sads', 'num_angrys']]

# One-hot encode the status_type column so every feature is numeric.
clustering_data = pd.get_dummies(clustering_data, columns=['status_type'])

# Standardize the data so that no single column dominates the distance
# computation because of its scale.
scaler = StandardScaler()
clustering_data_scaled = scaler.fit_transform(clustering_data)

# Use the elbow method to find the optimal number of clusters: fit K-Means for
# each candidate k and record the inertia (sum of squared distances).
k_values = range(1, 11)
sse = []
for k in k_values:
    # n_init is pinned explicitly: the library default differs between
    # scikit-learn releases, which would otherwise make the elbow curve depend
    # on the installed version (and emit a warning on some releases).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(clustering_data_scaled)
    sse.append(kmeans.inertia_)

# Plot the elbow graph, with one tick per candidate k so the elbow can be
# read off directly.
plt.figure(figsize=(12, 6))
plt.plot(list(k_values), sse, marker='o')
plt.xticks(list(k_values))
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.show()


Count of Different Types of Posts

In [None]:
# Number of posts per status_type value, most frequent first.
post_type_counts = facebook_data.loc[:, 'status_type'].value_counts(
    sort=True, ascending=False
)

# Last expression of the cell, so the notebook renders the counts.
post_type_counts


Average Engagement Metrics for Each Post Type

In [None]:
# Mean reactions, comments and shares for each post type.
average_metrics = (
    facebook_data
    .groupby('status_type')[['num_reactions', 'num_comments', 'num_shares']]
    .agg('mean')
)

# Last expression of the cell, so the notebook renders the table.
average_metrics


We'll start by loading the dataset, then move on to data preprocessing.

In [None]:
import pandas as pd

# Read the CSV export into a DataFrame.
file_path = '/mnt/data/Facebook_Marketplace_data.csv'
facebook_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the load.
facebook_data.head(n=5)


The dataset has been successfully preprocessed. Next, we will analyze how the time of upload (status_published) affects the number of reactions (num_reactions).

Analyzing Effect of Time on Reactions
1. Extract the hour from the `status_published` column.
2. Group by hour and calculate the mean number of reactions.
3. Plot the mean number of reactions by hour of the day.

Let's proceed with this analysis.

In [None]:
import matplotlib.pyplot as plt

# Derive the publishing hour from the parsed status_published timestamps.
facebook_data['hour_published'] = facebook_data['status_published'].dt.hour

# Average number of reactions for each publishing hour.
hourly_reactions = facebook_data.groupby('hour_published')['num_reactions'].agg('mean')

# Draw the hourly averages as bars; rot=0 keeps the hour labels horizontal.
plt.figure(figsize=(12, 6))
hourly_reactions.plot(kind='bar', rot=0)
plt.title('Average Number of Reactions by Hour of the Day')
plt.xlabel('Hour of the Day')
plt.ylabel('Average Number of Reactions')
plt.show()


Correlation Analysis

In [None]:
# Pearson correlation between reactions, comments and shares.
correlation_matrix = facebook_data[
    ['num_reactions', 'num_comments', 'num_shares']
].corr(method='pearson')

# Show the resulting matrix as the cell output.
correlation_matrix


Implementing these steps

In [None]:
# Data Preprocessing
#
# Cleans `facebook_data` in place. This cell expects `pd` and `facebook_data`
# to exist already (they are created by the data-loading cell).
import io

# Parse status_published into datetimes using the month/day/year hour:minute layout.
facebook_data['status_published'] = pd.to_datetime(
    facebook_data['status_published'], format='%m/%d/%Y %H:%M'
)

# Drop unnecessary columns. errors='ignore' keeps this step re-runnable:
# without it, running the cell a second time raises KeyError because the
# columns have already been removed from the frame.
facebook_data.drop(
    columns=['Column1', 'Column2', 'Column3', 'Column4'],
    inplace=True,
    errors='ignore',
)

# Handle missing values (if any) by removing every row that contains one.
facebook_data.dropna(inplace=True)

# Verify the preprocessing steps. info() writes its summary to a stream and
# returns None, so the text is captured through a buffer; that way
# preprocessed_info actually holds the summary instead of None.
info_buffer = io.StringIO()
facebook_data.info(buf=info_buffer)
preprocessed_info = info_buffer.getvalue()
preprocessed_head = facebook_data.head()

# Print the text summary, then leave the preview as the cell's last expression
# so the notebook renders it as a table rather than inside a tuple.
print(preprocessed_info)
preprocessed_head



Correlation Analysis
Next, we will calculate the correlation between the number of reactions (num_reactions), comments (num_comments), and shares (num_shares).

In [None]:
# Correlate the three engagement counts with one another
# (reactions vs. comments vs. shares).
correlation_matrix = (
    facebook_data[['num_reactions', 'num_comments', 'num_shares']]
    .corr()
)

# Leave the matrix as the final expression so it is displayed.
correlation_matrix


Let's proceed with this correlation analysis.

In [None]:
# Analyzing Effect of Time on Reactions

import matplotlib.pyplot as plt

# Publishing hour of each post, read from the status_published timestamps.
facebook_data['hour_published'] = facebook_data['status_published'].dt.hour

# Per-hour mean of num_reactions.
hourly_reactions = (
    facebook_data
    .groupby('hour_published')['num_reactions']
    .mean()
)

# Bar chart of the per-hour means; title and axis labels are set in one call
# on the Axes returned by the plot.
plt.figure(figsize=(12, 6))
hourly_reactions.plot(kind='bar').set(
    title='Average Number of Reactions by Hour of the Day',
    xlabel='Hour of the Day',
    ylabel='Average Number of Reactions',
)
plt.xticks(rotation=0)
plt.show()


In [None]:
# Calculate the correlation matrix between reactions, comments and shares.
correlation_matrix = facebook_data[['num_reactions', 'num_comments', 'num_shares']].corr()

# Display the correlation matrix. It is left as the cell's last expression
# (instead of being passed to print) so the notebook renders it as a formatted
# table, matching the other correlation cells in this notebook.
correlation_matrix
