In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# ***`Libraries`***

In [33]:
import pandas as pd
import re
import json
import ast
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx


# ***`Read Data`***

In [111]:
file_path = '/content/drive/My Drive/Colab Notebooks/SocialNetworkingCourse/Project VNexpress/vnexpress_articles_3.xlsx'
df = pd.read_excel(file_path)

In [112]:
def clean_date(date_str):
    # Lấy phần ngày tháng năm (loại bỏ các ký tự không cần thiết)
    date_part = date_str.split(",")[1].strip()
    # Chuyển đổi chuỗi thành định dạng datetime
    date_obj = pd.to_datetime(date_part, format='%d/%m/%Y')
    # Định dạng lại chuỗi ngày
    return date_obj.strftime('%A, %d/%m/%Y')
df['Formatted_Date'] = df['Date'].apply(clean_date)


In [113]:
df = df.drop(columns=['Date', 'Detailed Title', 'Comments Count', 'Total_Comments', 'Content', 'Link'])
df.head(5)

Unnamed: 0.1,Unnamed: 0,Title,Author,Author Link,Category,Author's Position,Comments_Post,Formatted_Date
0,0,Miễn học phí ngành Y?,Nguyễn Minh Hoàng,https://vnexpress.net/tac-gia/nguyen-minh-hoan...,Giáo dục & tri thức,Nhà công tác xã hội và phát triển cộng đồng,"[{'Nickname': 'Khánh Super', 'Nickname Link': ...","Tuesday, 31/12/2024"
1,1,Đến Việt Nam chữa bệnh,Nguyễn Hồng Hà,https://vnexpress.net/tac-gia/nguyen-hong-ha-1...,Y tế & sức khỏe,Bác sĩ phẫu thuật,"[{'Nickname': 'hanhantramchuong', 'Nickname Li...","Monday, 30/12/2024"
2,2,'Ăn cắp' thời công nghệ,Quan Thế Dân,https://vnexpress.net/tac-gia/quan-the-dan-144...,Văn hóa & lối sống,"Bác sĩ, Tiến sĩ Y học","[{'Nickname': 'thecong85', 'Nickname Link': 'h...","Sunday, 29/12/2024"
3,3,'Não cá vàng' vui vẻ,Jesse Peterson,https://vnexpress.net/tac-gia/jesse-peterson-1...,Văn hóa & lối sống,Tác giả sách,"[{'Nickname': 'Hung Tran Viet', 'Nickname Link...","Saturday, 28/12/2024"
4,4,"Tinh giản giấy tờ, đơn xin",Bùi Mẫn,https://vnexpress.net/tac-gia/bui-man-1438.html,Chính trị & chính sách,Kỹ sư cao cấp,"[{'Nickname': 'Phúc Nguyên', 'Nickname Link': ...","Friday, 27/12/2024"


In [114]:
# Function to process Comments_Post
def extract_comments_info(comments):
    result = []
    # Check if 'comments' is a string and attempt to convert to a list of dictionaries
    if isinstance(comments, str):
        try:
            comments = ast.literal_eval(comments)  # Safely evaluate the string
        except (SyntaxError, ValueError):
            # Handle cases where conversion fails (e.g., invalid JSON)
            return result  # Return empty result if conversion fails

    # Now proceed with the original logic if 'comments' is a list of dictionaries
    for comment in comments:
        if isinstance(comment, dict):  # Ensure comment is a dictionary
            result.append({
                "Nickname": comment.get("Nickname", ""),
                "Nickname Link": comment.get("Nickname Link", ""),
                "Reply Nicknames": comment.get("Reply Nicknames", []),
                "Reply Links": comment.get("Reply Links", [])
            })
    return result

# Apply extraction to the Comments_Post column
df['Processed_Comments'] = df['Comments_Post'].apply(extract_comments_info)

# Drop the original Comments_Post column for cleaner display
df.drop(columns=['Comments_Post'], inplace=True)

In [115]:
# Extract Author ID from Author Link (keep full ID including numbers)
df["Author Link"] = df["Author Link"].str.extract(r".*/(.*)\.html")

# Extract Processed_Comments into structured form
def process_comments(comments):
    processed = []
    for comment in comments:
        nickname_link = comment.get("Nickname Link", "")
        reply_links = comment.get("Reply Links", [])
        # Extract IDs
        nickname_id = nickname_link.split("/")[-1] if nickname_link else ""
        reply_ids = [link.split("/")[-1] for link in reply_links]
        processed.append({
            "Nickname ID": nickname_id,
            "Reply IDs": reply_ids,
        })
    return processed

df["Processed_Comments"] = df["Processed_Comments"].apply(process_comments)


df = df.drop(columns = ['Title', 'Author', "Author's Position", "Formatted_Date"])
# Display the result
print(df)

   Unnamed: 0             Author Link                Category  \
0           0  nguyen-minh-hoang-1955     Giáo dục & tri thức   
1           1     nguyen-hong-ha-1279         Y tế & sức khỏe   
2           2       quan-the-dan-1447      Văn hóa & lối sống   
3           3     jesse-peterson-1050      Văn hóa & lối sống   
4           4            bui-man-1438  Chính trị & chính sách   
5           5       vo-nhat-vinh-1166   Kinh doanh & quản trị   
6           6     dang-thai-hoang-433   Kinh doanh & quản trị   
7           7  trinh-phuong-quan-1404   Kinh doanh & quản trị   
8           8            bui-man-1438   Kinh doanh & quản trị   
9           9    ngo-trong-thanh-1006      Văn hóa & lối sống   

                                  Processed_Comments  
0  [{'Nickname ID': '1026010466', 'Reply IDs': ['...  
1  [{'Nickname ID': '1051012051', 'Reply IDs': ['...  
2  [{'Nickname ID': '1059731000', 'Reply IDs': ['...  
3  [{'Nickname ID': '1014057665', 'Reply IDs': ['...  
4  [{'Nic

In [116]:
# Function to parse comments
def parse_comments(data):
    try:
        return eval(data)  # Directly parse as list
    except Exception as e:
        return []

# Build directed graph
G = nx.DiGraph()
for _, row in df.iterrows():
    author = row["Author Link"]
    category = row["Category"]
    comments = parse_comments(row["Processed_Comments"])

    G.add_node(author, type="Author Link", category=category)

    for comment in comments:
        nickname = comment.get("Nickname ID")
        if nickname:
            G.add_node(nickname, type="Commenter", category=category)
            G.add_edge(author, nickname)

            reply_ids = comment.get("Reply IDs", [])
            for reply_id in reply_ids:
                G.add_node(reply_id, type="Replier", category=category)
                G.add_edge(nickname, reply_id)

# Calculate basic metrics
num_nodes = len(G.nodes)
num_edges = len(G.edges)
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Compile metrics into a DataFrame
centrality_df = pd.DataFrame({
    "Node": list(G.nodes),
    "Type": [G.nodes[node]['type'] for node in G.nodes],
    "Category": [G.nodes[node]['category'] for node in G.nodes],
    "Number of Nodes": num_nodes,
    "Number of Edges": num_edges,
    "Degree Centrality": list(degree_centrality.values()),
    "Betweenness Centrality": list(betweenness_centrality.values()),
    "Closeness Centrality": list(closeness_centrality.values()),
})


In [117]:
print(centrality_df)

                     Node         Type               Category  \
0  nguyen-minh-hoang-1955  Author Link    Giáo dục & tri thức   
1     nguyen-hong-ha-1279  Author Link        Y tế & sức khỏe   
2       quan-the-dan-1447  Author Link     Văn hóa & lối sống   
3     jesse-peterson-1050  Author Link     Văn hóa & lối sống   
4            bui-man-1438  Author Link  Kinh doanh & quản trị   
5       vo-nhat-vinh-1166  Author Link  Kinh doanh & quản trị   
6     dang-thai-hoang-433  Author Link  Kinh doanh & quản trị   
7  trinh-phuong-quan-1404  Author Link  Kinh doanh & quản trị   
8    ngo-trong-thanh-1006  Author Link     Văn hóa & lối sống   

   Number of Nodes  Number of Edges  Degree Centrality  \
0                9                0                0.0   
1                9                0                0.0   
2                9                0                0.0   
3                9                0                0.0   
4                9                0                0.0   
5

In [118]:
# Export the graph to a file in GraphML format
output_file = "/content/drive/My Drive/Colab Notebooks/SocialNetworkingCourse/Project VNexpress/social_network.graphml"
nx.write_graphml(G, output_file)

output_file


'/content/drive/My Drive/Colab Notebooks/SocialNetworkingCourse/Project VNexpress/social_network.graphml'