In [1]:
import os

from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
from collections import Counter
from pyvis.network import Network
import numpy as np

# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 1: Load the data 載入資料</div>

# PTT AI 討論資料集介紹

## 資料集概覽

此資料集包含有關 AI 的討論，收集自台灣的社群媒體平台 PTT。資料收集範圍從 2022 年 1 月 1 日到 2024 年 5 月 31 日，共包含 2,172 筆資料。

## 資料欄位說明

資料集包含以下 11 個欄位：

1. `system_id`: 系統編號，總共 2172 筆，無空值。
2. `artComment`: 文章評論，總共 2172 筆，無空值。
3. `artDate`: 文章日期，總共 2172 筆，無空值。
4. `insertedDate`: 插入日期，總共 2172 筆，無空值。
5. `artContent`: 文章內容，總共 2172 筆，無空值。
6. `artCatagory`: 文章類別，總共 2172 筆，無空值。
7. `dataSource`: 資料來源，總共 2172 筆，無空值，全部來自 PTT。
8. `artPoster`: 發文者，總共 2172 筆，無空值。
9. `artTitle`: 文章標題，總共 2172 筆，無空值。
10. `artUrl`: 文章網址，總共 2172 筆，無空值。

## 記憶體使用情況

- 資料集大小： 186.8+ KB
- 資料型態：整數 (int64) 1 欄，物件 (object) 9 欄

In [2]:
df = pd.read_csv('data/ptt_ai_221130to240531.csv')
df.head()

Unnamed: 0,system_id,artUrl,artTitle,artDate,artPoster,artCatagory,artContent,artComment,e_ip,insertedDate,dataSource
0,1,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,[徵才]普匯金融科技-金融科技行銷/金融實習生,2022-02-15 17:29:30,ggglu,Finance,【實習】\n【公司名稱】Influx FinTech 普匯金融科技\n\n【工作職缺】Fin...,"[{""cmtStatus"": ""噓"", ""cmtPoster"": ""cow38"", ""cmt...",118.168.142.101,2022-02-16 00:12:45,ptt
1,2,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,[新聞]台銀「三駕馬車」戰略發威前4月,2022-05-21 02:29:23,make1302,Finance,（中央社記者張璦台北20日電）台灣銀行消金、企金、政府部門業務「三駕馬車」再發威\n，董事長...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""DerLuna"", ""c...",118.171.234.47,2022-05-22 01:11:41,ptt
2,3,https://www.ptt.cc/bbs/Finance/M.1676346325.A....,[新聞]央行徵才月薪最高86K,2023-02-14 11:45:19,kria5304,Finance,https://tinyurl.com/3fcp6ehh\n記者陳美君／台北報導\n2023...,"[{""cmtStatus"": ""噓"", ""cmtPoster"": ""fill725258"",...",36.224.200.45,2023-02-15 01:06:37,ptt
3,4,https://www.ptt.cc/bbs/Finance/M.1698684536.A....,[新聞]證交所徵才開創數位驅動時代報名至11/17,2023-10-31 00:48:54,BangBang5566,Finance,證交所徵才開創數位驅動時代 報名至11/17止\n\n中央社\n2023年10月30日 週一...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""Laviathan"", ...",42.70.143.5,2023-10-31 01:10:55,ptt
4,5,https://www.ptt.cc/bbs/job/M.1641165773.A.E67....,[台中][西屯］飛斯戴爾有限公司/日文客服,2022-01-03 07:22:51,howdigh,job,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],125.230.247.67,2022-01-04 00:58:48,ptt


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2172 entries, 0 to 2171
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   system_id     2172 non-null   int64 
 1   artUrl        2172 non-null   object
 2   artTitle      2172 non-null   object
 3   artDate       2172 non-null   object
 4   artPoster     2172 non-null   object
 5   artCatagory   2172 non-null   object
 6   artContent    2172 non-null   object
 7   artComment    2172 non-null   object
 8   e_ip          2126 non-null   object
 9   insertedDate  2172 non-null   object
 10  dataSource    2172 non-null   object
dtypes: int64(1), object(10)
memory usage: 186.8+ KB


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Load the font 載入字型</div>

In [4]:
# Path to the custom font
font_path = 'font/TraditionalChinese.ttf'

# Add the custom font to the font manager
font_manager.fontManager.addfont(font_path)

# After adding the font, search for it by filename to get the correct font name
for font in font_manager.fontManager.ttflist:
    if font.fname == font_path:
        print(f"Found font: {font.name}")
        plt.rcParams['font.family'] = font.name
        break

Found font: Noto Sans TC


# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 2: Data Preprocessing 資料前處理</div>

## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Data Cleaning 資料清理</div>

In [5]:
# 複製原始資料集
clear_df = df.copy()

#去除本次不會用到的欄位
drop_cols = ['system_id', 'artTitle', 'artCatagory', 'dataSource', 'insertedDate']
clear_df.drop(drop_cols, axis = 1, inplace = True)

#去除文章內容為空值的筆數
clear_df.dropna(subset = ['artContent'], axis=0, how='any', inplace=True)

#新增['sentence']欄位，用'。'取代'\n\n'，並移除'\n'
clear_df['sentence'] = clear_df['artContent'].str.replace(r'\n\n','。', regex=True)
clear_df['sentence'] = clear_df['sentence'].str.replace(r'\n','，', regex=True)

#移除內文中的網址
clear_df['sentence'] = clear_df['sentence'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

clear_df.head(10)

Unnamed: 0,artUrl,artDate,artPoster,artContent,artComment,e_ip,sentence
0,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,2022-02-15 17:29:30,ggglu,【實習】\n【公司名稱】Influx FinTech 普匯金融科技\n\n【工作職缺】Fin...,"[{""cmtStatus"": ""噓"", ""cmtPoster"": ""cow38"", ""cmt...",118.168.142.101,【實習】，【公司名稱】Influx FinTech 普匯金融科技。【工作職缺】Fintech...
1,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,2022-05-21 02:29:23,make1302,（中央社記者張璦台北20日電）台灣銀行消金、企金、政府部門業務「三駕馬車」再發威\n，董事長...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""DerLuna"", ""c...",118.171.234.47,（中央社記者張璦台北20日電）台灣銀行消金、企金、政府部門業務「三駕馬車」再發威，，董事長呂...
2,https://www.ptt.cc/bbs/Finance/M.1676346325.A....,2023-02-14 11:45:19,kria5304,https://tinyurl.com/3fcp6ehh\n記者陳美君／台北報導\n2023...,"[{""cmtStatus"": ""噓"", ""cmtPoster"": ""fill725258"",...",36.224.200.45,週二 上午6:32，擁抱「金」飯碗的機會來了。中央銀行近日公開徵才，開出的職缺包括「經濟金...
3,https://www.ptt.cc/bbs/Finance/M.1698684536.A....,2023-10-31 00:48:54,BangBang5566,證交所徵才開創數位驅動時代 報名至11/17止\n\n中央社\n2023年10月30日 週一...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""Laviathan"", ...",42.70.143.5,證交所徵才開創數位驅動時代 報名至11/17止。中央社，2023年10月30日 週一 下午1...
4,https://www.ptt.cc/bbs/job/M.1641165773.A.E67....,2022-01-03 07:22:51,howdigh,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],125.230.247.67,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...
5,https://www.ptt.cc/bbs/job/M.1641369547.A.B96....,2022-01-05 15:59:05,cakelover,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],1.160.17.133,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...
6,https://www.ptt.cc/bbs/job/M.1642053918.A.308....,2022-01-13 14:05:16,qhorohoro,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],61.220.52.4,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...
7,https://www.ptt.cc/bbs/job/M.1642382909.A.5D7....,2022-01-17 09:28:27,catty0310,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],1.162.212.73,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...
8,https://www.ptt.cc/bbs/job/M.1642668761.A.F44....,2022-01-20 16:52:39,trpw,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],219.87.87.33,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...
9,https://www.ptt.cc/bbs/job/M.1642761016.A.4BA....,2022-01-21 18:30:13,GhostGrace,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],223.137.115.117,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Deal with Comments 處理留言</div>

In [6]:
tqdm.pandas()

# 處理某篇文章的所有留言（取出留言者與狀態）
def get_comment_info(com):
  commenters,comment_status = [],[]
  com = eval(com)
  for i in com:
    commenters.append(i['cmtPoster'])
    comment_status.append(i['cmtStatus'])
  return pd.Series([commenters, comment_status])

# 將留言者與狀態分開
clear_df[['commenters', 'comment_status']] = clear_df['artComment'].apply(get_comment_info)
clear_df.head()

Unnamed: 0,artUrl,artDate,artPoster,artContent,artComment,e_ip,sentence,commenters,comment_status
0,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,2022-02-15 17:29:30,ggglu,【實習】\n【公司名稱】Influx FinTech 普匯金融科技\n\n【工作職缺】Fin...,"[{""cmtStatus"": ""噓"", ""cmtPoster"": ""cow38"", ""cmt...",118.168.142.101,【實習】，【公司名稱】Influx FinTech 普匯金融科技。【工作職缺】Fintech...,"[cow38, IKnowWhy, z6112539, blackfire]","[噓, 噓, 噓, 噓]"
1,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,2022-05-21 02:29:23,make1302,（中央社記者張璦台北20日電）台灣銀行消金、企金、政府部門業務「三駕馬車」再發威\n，董事長...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""DerLuna"", ""c...",118.171.234.47,（中央社記者張璦台北20日電）台灣銀行消金、企金、政府部門業務「三駕馬車」再發威，，董事長呂...,"[DerLuna, TainanBus, whocare96, wekl, poisonB,...","[推, 推, 推, 推, 推, 噓, 推, 推, →, 推, →, 推, 噓, 推, →, ..."
2,https://www.ptt.cc/bbs/Finance/M.1676346325.A....,2023-02-14 11:45:19,kria5304,https://tinyurl.com/3fcp6ehh\n記者陳美君／台北報導\n2023...,"[{""cmtStatus"": ""噓"", ""cmtPoster"": ""fill725258"",...",36.224.200.45,週二 上午6:32，擁抱「金」飯碗的機會來了。中央銀行近日公開徵才，開出的職缺包括「經濟金...,"[fill725258, junkuo, lolicum, Kydland, BernieW...","[噓, 推, 推, 推, 推, 推, →, 推, 推]"
3,https://www.ptt.cc/bbs/Finance/M.1698684536.A....,2023-10-31 00:48:54,BangBang5566,證交所徵才開創數位驅動時代 報名至11/17止\n\n中央社\n2023年10月30日 週一...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""Laviathan"", ...",42.70.143.5,證交所徵才開創數位驅動時代 報名至11/17止。中央社，2023年10月30日 週一 下午1...,"[Laviathan, cuteman0725, flybydance, Laviathan...","[推, 推, 推, 推, 推, 推, 推, →, 推, 噓, 推, 推, 推, →, 推, ..."
4,https://www.ptt.cc/bbs/job/M.1641165773.A.E67....,2022-01-03 07:22:51,howdigh,job版禁止張貼違反「\n就業服務法\n」、\n「\n性別平等工作法\n」、\n「\n勞基法...,[],125.230.247.67,job版禁止張貼違反「，就業服務法，」、，「，性別平等工作法，」、，「，勞基法，」與其他法律...,[],[]


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Explode the DataFrame 展開資料</div>

In [7]:
# 展開留言者與狀態
clear_df = clear_df.explode(['commenters', 'comment_status'])

social_df = clear_df[['artPoster','artUrl','commenters','comment_status']]

social_df.head()

Unnamed: 0,artPoster,artUrl,commenters,comment_status
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,cow38,噓
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,IKnowWhy,噓
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,z6112539,噓
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,blackfire,噓
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,DerLuna,推


In [8]:
social_df['comment_status'].value_counts()

comment_status
推    63199
→    53625
噓    10843
Name: count, dtype: int64

# <div style="font-family: 'Garamond', serif; font-size: 22px; color: #ffffff; background-color: #34568B; text-align: center; padding: 15px; border-radius: 10px; border: 2px solid #FF6F61; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.3); margin-bottom: 20px;">Step 3: Network Analysis 網路分析</div>

## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Filter the Data 篩選資料</div>

In [9]:
# Count the frequency of each user in the comments and posts
user_count = Counter(social_df['artPoster'].tolist() + social_df['commenters'].tolist())

# Get the top 30 most active users
top_users = {user for user, count in user_count.most_common(30)}

# Filter edges to include only interactions between top users
top_filtered_df = social_df[social_df['artPoster'].isin(top_users) & social_df['commenters'].isin(top_users)]

top_filtered_df.head()

Unnamed: 0,artPoster,artUrl,commenters,comment_status
166,keel90135,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,DrTech,→
166,keel90135,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,DrTech,→
166,keel90135,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,DrTech,→
166,keel90135,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,DrTech,→
166,keel90135,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,DrTech,→


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Transform the Data 轉換資料</div>

In [10]:
# 留言者對文章，狀態為 weight
re_df = top_filtered_df[['commenters','artUrl','comment_status']].rename(columns = {'commenters':'src','artUrl':'dis','comment_status':'weight'})
re_df = re_df[~re_df['src'].isna()]
re_df.head()

Unnamed: 0,src,dis,weight
166,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,→
166,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,→
166,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,→
166,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,→
166,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,→


In [11]:
def convert_status(s):
  if s == '推':
    return 2
  elif s == '→':
    return 1
  else :
    return -1
  
re_df['weight'] = re_df['weight'].map(convert_status)

# 計算某位留言者對某篇文章的總分數
re_df = re_df.groupby(['src','dis']).sum().reset_index()
re_df

Unnamed: 0,src,dis,weight
0,Cartier,https://www.ptt.cc/bbs/Stock/M.1689924118.A.D9...,2
1,DamnDre,https://www.ptt.cc/bbs/Stock/M.1705162613.A.94...,2
2,DamnDre,https://www.ptt.cc/bbs/Stock/M.1705765005.A.5C...,-1
3,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,5
4,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1686219286.A...,11
...,...,...,...
272,xephon,https://www.ptt.cc/bbs/Stock/M.1714179479.A.DD...,1
273,xephon,https://www.ptt.cc/bbs/Stock/M.1716983131.A.BD...,3
274,yakimochi,https://www.ptt.cc/bbs/Stock/M.1690883012.A.16...,2
275,yakimochi,https://www.ptt.cc/bbs/Stock/M.1695649551.A.97...,1


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Create the Network for users and articles 建立使用者與文章的網路</div>

In [12]:
# 設定分數低的為紅色
def get_color(w):
  if w>0:
    return 'green'
  else:
    return 'red'
  
re_df['color'] = re_df.weight.map(get_color)
re_df

Unnamed: 0,src,dis,weight,color
0,Cartier,https://www.ptt.cc/bbs/Stock/M.1689924118.A.D9...,2,green
1,DamnDre,https://www.ptt.cc/bbs/Stock/M.1705162613.A.94...,2,green
2,DamnDre,https://www.ptt.cc/bbs/Stock/M.1705765005.A.5C...,-1,red
3,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1659257932.A...,5,green
4,DrTech,https://www.ptt.cc/bbs/Soft_Job/M.1686219286.A...,11,green
...,...,...,...,...
272,xephon,https://www.ptt.cc/bbs/Stock/M.1714179479.A.DD...,1,green
273,xephon,https://www.ptt.cc/bbs/Stock/M.1716983131.A.BD...,3,green
274,yakimochi,https://www.ptt.cc/bbs/Stock/M.1690883012.A.16...,2,green
275,yakimochi,https://www.ptt.cc/bbs/Stock/M.1695649551.A.97...,1,green


In [13]:
# 發文者對文章
po_df = top_filtered_df[['artPoster', 'artUrl']].rename(columns={'artPoster': 'src', 'artUrl': 'dis'}).drop_duplicates()

In [14]:
# 人為綠色節點 po文為橘色節點

# 建立一個網路圖
netWork = Network(notebook=True, cdn_resources='in_line', directed=True)

# 所有發文者＋留言者
person = list(set(po_df.src.unique().tolist() + re_df.src.unique().tolist()))
url = po_df.dis.unique().tolist()

# 加入節點（人）
netWork.add_nodes(
    nodes=person,
    value=[1 for i in range(len(person))],
    color=['#66CDAA' for i in range(len(person))],
    title=person
)

# 加入節點（文章）
netWork.add_nodes(
    nodes=url,
    value=[2 for i in range(len(url))],
    color=['#FFB366' for i in range(len(url))],
    title=url
)

# 加入邊（發文者 -> 文章）
for i in po_df.to_numpy():
    netWork.add_edge(i[0], i[1], width=2, color='grey')
# 加入邊（留言者 -> 文章），顏色為某發文者對該文章的總分（>0:綠; <=0:紅）
for i in re_df.to_numpy():
    netWork.add_edge(i[0], i[1], width=2, color=i[3])

# 設定layout，圖節點之間的斥力
netWork.repulsion()

if not os.path.exists('plot'):
    os.makedirs('plot')

netWork.show('plot/Network.html')

plot/Network.html


## <div style="font-family: 'Lucida Sans Unicode', sans-serif; font-size: 18px; color: #4A235A; background-color: #D7BDE2; text-align: left; padding: 10px; border-left: 5px solid #7D3C98; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2); margin-bottom: 10px;">Relationship Between Users 網友關係</div>

In [15]:
pos_comment = social_df.copy()
pos_comment = pos_comment[~pos_comment['commenters'].isna()]
pos_comment.head(10)

Unnamed: 0,artPoster,artUrl,commenters,comment_status
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,cow38,噓
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,IKnowWhy,噓
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,z6112539,噓
0,ggglu,https://www.ptt.cc/bbs/Finance/M.1644917373.A....,blackfire,噓
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,DerLuna,推
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,TainanBus,推
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,whocare96,推
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,wekl,推
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,poisonB,推
1,make1302,https://www.ptt.cc/bbs/Finance/M.1653071365.A....,ridiculousYA,噓


In [16]:
pos_comment['score'] = pos_comment['comment_status'].apply(lambda r: convert_status(r))

# 某留言者對某發文者的總分
pos_comment = pos_comment.groupby(['commenters', 'artPoster']).sum('score').reset_index()
pos_comment = pos_comment[pos_comment.score>=0]
pos_comment = pos_comment[pos_comment['commenters'] != pos_comment.artPoster]

pos_comment

Unnamed: 0,commenters,artPoster,score
0,A10,pl132,3
1,A10,ynlin1996,2
2,A10,zzzz8931,1
3,A1pha,Angels5566,4
4,A1pha,DrowningPool,4
...,...,...,...
77163,zzzzzzzzzzzy,x5723,2
77164,zzzzzzzzzzzy,xephon,2
77165,zzzzzzzzzzzy,yakimochi,3
77166,zzzzzzzzzzzy,yogo3388,4


In [17]:
matrix = pd.pivot_table(pos_comment,index = 'commenters', columns = 'artPoster' ,values='score').fillna(0)
matrix.head()

artPoster,AAAB,ARKUE,ARRSNASA,AUTIS,Abre,AgentSkye56,AgileSeptor,AlainDelon,Alison5566,AllBlack,...,z520314,zakijudelo,zetacat,zhongrong,zuzinajp,zxc8787,zxcvxx,zxxxxxxxg,zzahoward,zzzz8931
commenters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A1pha,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A22813079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A791027A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A80211ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# 所有留言者與發文者
pers = np.unique(pos_comment[['commenters', 'artPoster']])

# 建立評分矩陣（留言者對發文者分數）
# 取得所有人對於其他人的分數（有方向性）
# 矩陣中的值 mat[i][j] 代表使用者 pers[i]（留言者）對使用者 pers[j]（發文者）的評分，
# 留言者和發文者之間的關係是有方向的（即，mat[i][j] 不一定等於 mat[j][i]）
matrix = pd.pivot_table(pos_comment,index = 'commenters', columns = 'artPoster' ,values='score' ).fillna(0)\
  .reindex(columns=pers, index=pers, fill_value=0).to_numpy()
matrix.shape

(17482, 17482)

In [19]:
# 取得所有人與其他人互動分數總和（無方向，兩個方向分數相加）
# 矩陣中的每一對元素都對稱，即 mat_s[i][j] 等於 mat_s[j][i]
# 對稱矩陣中的值 mat_s[i][j] 表示使用者 pers[i] 與使用者 pers[j] 之間的互動總分。
# 透過將 pers[i] 對 pers[j] 的評分和 pers[j] 對 pers[i] 的評分相加而得到的。
# np.tril(mat, -1): 生成一個下三角矩陣（包括主對角線下方的所有元素，但不包括主對角線上的元素）
# np.triu(mat, 1): 生成一個上三角矩陣（包括主對角線上方的所有元素，但不包括主對角線上的元素）
tri = (np.tril(matrix,-1).T + np.triu(matrix,1))
matrix_sum = tri+tri.T # 上三角和下三角都包括在內，但對角線是 0
matrix_sum

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])