# 資料前處理 & 情緒分析


套件設定

In [23]:
import jieba
import jieba.analyse
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from collections import Counter
from snownlp import SnowNLP
%matplotlib inline

## MetaData
- 資料來源 : PTT 汽車版、汽車買賣版
- 資料區間：2020/12/01 ~ 2023/01/31

| 廠牌 | 關鍵字 | 資料總筆數 |
| --- | --- | --- |
| Nissan | Nissan、裕隆、裕日,日產、Sentra、Kicks、仙草 | 2,464 |
| Toyota | Toyota、Altis、Cross、豐田、和泰、阿提斯、卡羅拉 | 8,755 |
| Ford | 福特、六和、九和、上正、Ford、Focus | 4,684 |
| Honda | Honda、HRV、本田 | 2,488 |
| Mazda | Mazda、CX-3、CX-30、馬三、Mazda 3 | 3,321 |

## 資料載入
根據自己的data路徑修改 <br>
`data不上github`，要注意ignore

In [105]:
ptt = pd.read_csv("../data/rawData/mazda_ptt_data.csv") 
ptt.head(3)

Unnamed: 0,system_id,artUrl,artTitle,artDate,artPoster,artCatagory,artContent,artComment,e_ip,insertedDate,dataSource
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH,2020-12-01 00:09:42,city0504,car,原文連結：\nhttps://ctee.com.tw/lohas/car/378518.ht...,"[{""cmtStatus"": ""→"", ""cmtPoster"": ""mingchaoliu""...",111.243.121.95,2020-12-01 00:04:07,ptt
1,2,https://www.ptt.cc/bbs/car/M.1606791807.A.CA7....,[新聞]預計明年現身，ToyotaRAV4將推全新動力！,2020-12-01 11:03:24,yamatobar,car,原文連結：\nhttps://auto.ltn.com.tw/news/16610/3\n原...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""XXXXBANG"", ""...",1.171.168.195,2020-12-02 00:04:03,ptt
2,3,https://www.ptt.cc/bbs/car/M.1606791901.A.39C....,[情報]新世代Mazda2/CX-3有望沿用Yaris平台,2020-12-01 11:04:57,oppoR20,car,新一代Mazda 2有望直接沿用Yaris平台，CX3也可能直接辦理！\nhttps://w...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""wang960615"",...",140.125.222.17,2020-12-02 00:04:03,ptt


In [106]:
# 看看有幾篇文章
print(f"number of posts: {ptt.shape[0]}")
print(f"date range: {(ptt['artDate'].min(), ptt['artDate'].max())}")
print(f"category: \n{ptt['artCatagory'].value_counts()}")


number of posts: 3321
date range: ('2020-12-01 00:09:42', '2023-01-30 21:30:53')
category: 
artCatagory
CarShop    1777
car        1544
Name: count, dtype: int64


### 留言萃取
取出`artComment`的`cmtContent`

In [107]:
ptt = ptt[ptt.artComment != '[]'] # 刪除沒有comment的文章

# 取出 commentContent
def getComtInfo(com):
    cmtContent = ""
    com = eval(com)
    # print(com)
    for i in com:
        # print(i)
        cmtContent += i['cmtContent'] + "。"
    return pd.Series([cmtContent])


ptt[['cmtContent']] = ptt['artComment'].apply(lambda r: getComtInfo(r))
ptt.head(3)

Unnamed: 0,system_id,artUrl,artTitle,artDate,artPoster,artCatagory,artContent,artComment,e_ip,insertedDate,dataSource,cmtContent
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH,2020-12-01 00:09:42,city0504,car,原文連結：\nhttps://ctee.com.tw/lohas/car/378518.ht...,"[{""cmtStatus"": ""→"", ""cmtPoster"": ""mingchaoliu""...",111.243.121.95,2020-12-01 00:04:07,ptt,:要看樓主住哪或開車活動範圍在哪？西部是還好啦。:一年是可以去保養廠幾次。:3008快小改款...
1,2,https://www.ptt.cc/bbs/car/M.1606791807.A.CA7....,[新聞]預計明年現身，ToyotaRAV4將推全新動力！,2020-12-01 11:03:24,yamatobar,car,原文連結：\nhttps://auto.ltn.com.tw/news/16610/3\n原...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""XXXXBANG"", ""...",1.171.168.195,2020-12-02 00:04:03,ptt,:電動浴缸？。:防水電動車。:哈哈。:原廠有打算出敞篷RAV4嗎?。:RAV250h？？？。...
2,3,https://www.ptt.cc/bbs/car/M.1606791901.A.39C....,[情報]新世代Mazda2/CX-3有望沿用Yaris平台,2020-12-01 11:04:57,oppoR20,car,新一代Mazda 2有望直接沿用Yaris平台，CX3也可能直接辦理！\nhttps://w...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""wang960615"",...",140.125.222.17,2020-12-02 00:04:03,ptt,:北美的YARIS停產了。:但是這篇內文的東西總感覺可能全球發售。:畢竟馬二的銷量除了日本本...


### 資料清理

In [108]:
# 清除空值
ptt.dropna(subset=['artContent'], axis=0, how='any', inplace=True)
ptt.dropna(subset=['artTitle'], axis=0, how='any', inplace=True)
ptt.dropna(subset=['cmtContent'], axis=0, how='any', inplace=True)

# 用'。'取代'\n\n'，並移除'\n'
ptt = ptt.replace(r'[\n]+', '。', regex=True)

# 移除內文中的網址
ptt['artContent'] = ptt['artContent'].str.replace('(http|https)://.*', '', regex=True).replace(r'www\S+', '', regex=True)
ptt['artTitle'] = ptt['artTitle'].str.replace('(http|https)://.*', '', regex=True).replace(r'www\S+', '', regex=True)
ptt['cmtContent'] = ptt['cmtContent'].str.replace('(http|https)://.*', '', regex=True).replace(r'www\S+', '', regex=True)


ptt

Unnamed: 0,system_id,artUrl,artTitle,artDate,artPoster,artCatagory,artContent,artComment,e_ip,insertedDate,dataSource,cmtContent
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH,2020-12-01 00:09:42,city0504,car,原文連結：。,"[{""cmtStatus"": ""→"", ""cmtPoster"": ""mingchaoliu""...",111.243.121.95,2020-12-01 00:04:07,ptt,:要看樓主住哪或開車活動範圍在哪？西部是還好啦。:一年是可以去保養廠幾次。:3008快小改款...
1,2,https://www.ptt.cc/bbs/car/M.1606791807.A.CA7....,[新聞]預計明年現身，ToyotaRAV4將推全新動力！,2020-12-01 11:03:24,yamatobar,car,原文連結：。,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""XXXXBANG"", ""...",1.171.168.195,2020-12-02 00:04:03,ptt,:電動浴缸？。:防水電動車。:哈哈。:原廠有打算出敞篷RAV4嗎?。:RAV250h？？？。...
2,3,https://www.ptt.cc/bbs/car/M.1606791901.A.39C....,[情報]新世代Mazda2/CX-3有望沿用Yaris平台,2020-12-01 11:04:57,oppoR20,car,新一代Mazda 2有望直接沿用Yaris平台，CX3也可能直接辦理！。,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""wang960615"",...",140.125.222.17,2020-12-02 00:04:03,ptt,:北美的YARIS停產了。:但是這篇內文的東西總感覺可能全球發售。:畢竟馬二的銷量除了日本本...
3,4,https://www.ptt.cc/bbs/car/M.1606792933.A.D92....,Re:[情報]新世代Mazda2/CX-3有望沿用Yaris平台,2020-12-01 11:22:11,jason89514,car,這是依照之前 Mazda Q3 財報會上所公佈的資訊。,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""JOHN2188"", ""...",114.43.130.93,2020-12-02 00:04:03,ptt,:進口、最新1.5油電、LV2，很香啊。
4,5,https://www.ptt.cc/bbs/car/M.1606794935.A.AA7....,[情報]2020年11月份臺灣汽車市場銷售報告,2020-12-01 11:55:33,jerrysuper,car,新增小七車。,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""XXXXBANG"", ""...",111.251.90.133,2020-12-02 00:04:03,ptt,:反指標板真的狂風向反著吹。:Kuga贏CRV。:雨衣廠商連帶受惠。:看車版以為RAV4銷量...
...,...,...,...,...,...,...,...,...,...,...,...,...
3316,3317,https://www.ptt.cc/bbs/CarShop/M.1674922265.A....,[購車]全新MazdaCX3020SCarbonEdition,2023-01-29 00:11:03,chaoyoyo,CarShop,車輛狀況：2023 全新。車輛品牌：Mazda。車款型式：CX30 20S Carbon。車...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""a7426891"", ""...",115.43.31.123,2023-01-29 01:03:07,ptt,:鐵灰+1。:+1。:北部鐵灰+1。:台南+1。
3317,3318,https://www.ptt.cc/bbs/CarShop/M.1675004691.A....,[購車]MazdaCX-520SPremiumSE,2023-01-29 23:04:49,Haskell,CarShop,車輛狀況：全新。車輛品牌：Mazda。車款型式：CX-5 20S Premium SE。車輛...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""UPIO"", ""cmtC...",36.228.84.16,2023-01-30 01:03:03,ptt,:+1。:+1。:+1。:+1。
3318,3319,https://www.ptt.cc/bbs/CarShop/M.1675041873.A....,[購車]2023MazdaCX-520SPremiumSE,2023-01-30 09:24:31,UPIO,CarShop,車輛狀況：全新。車輛品牌：Mazda。車款型式：2023 Mazda CX-5 20S Pr...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""rain062811"",...",122.116.29.47,2023-01-31 01:03:07,ptt,:+1。:+1。:+1。:+1。:+1。:+1。:+1。
3319,3320,https://www.ptt.cc/bbs/CarShop/M.1675050698.A....,[購車]Mazda35D2023年式20SSignature/Prem,2023-01-30 11:51:36,zx6226880,CarShop,車輛狀況：全新。 。車輛品牌：Mazda3。 。車款型式：Mazda3 5D 2023年式 ...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""s77329"", ""cm...",223.138.72.252,2023-01-31 01:03:07,ptt,:高雄台南+1，謝謝。:中部+1謝謝。:北部+1。:雙北+1。:北部&宜蘭+1。:高雄台南4...


將 `artTitle`, `artContent`, `cmtContent` 合併成新欄位 `whole_content`

In [109]:
ptt['whole_content'] = ptt['artTitle'] + ptt['artContent'] + ptt['cmtContent']
ptt = ptt[['system_id', 'artUrl', 'artDate', 'artCatagory', 'whole_content']]
ptt

Unnamed: 0,system_id,artUrl,artDate,artCatagory,whole_content
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01 00:09:42,car,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH原文連結：。:要看樓主住哪或開...
1,2,https://www.ptt.cc/bbs/car/M.1606791807.A.CA7....,2020-12-01 11:03:24,car,[新聞]預計明年現身，ToyotaRAV4將推全新動力！原文連結：。:電動浴缸？。:防水電動...
2,3,https://www.ptt.cc/bbs/car/M.1606791901.A.39C....,2020-12-01 11:04:57,car,[情報]新世代Mazda2/CX-3有望沿用Yaris平台新一代Mazda 2有望直接沿用Y...
3,4,https://www.ptt.cc/bbs/car/M.1606792933.A.D92....,2020-12-01 11:22:11,car,Re:[情報]新世代Mazda2/CX-3有望沿用Yaris平台這是依照之前 Mazda Q...
4,5,https://www.ptt.cc/bbs/car/M.1606794935.A.AA7....,2020-12-01 11:55:33,car,[情報]2020年11月份臺灣汽車市場銷售報告新增小七車。:反指標板真的狂風向反著吹。:Ku...
...,...,...,...,...,...
3316,3317,https://www.ptt.cc/bbs/CarShop/M.1674922265.A....,2023-01-29 00:11:03,CarShop,[購車]全新MazdaCX3020SCarbonEdition車輛狀況：2023 全新。車輛...
3317,3318,https://www.ptt.cc/bbs/CarShop/M.1675004691.A....,2023-01-29 23:04:49,CarShop,[購車]MazdaCX-520SPremiumSE車輛狀況：全新。車輛品牌：Mazda。車款...
3318,3319,https://www.ptt.cc/bbs/CarShop/M.1675041873.A....,2023-01-30 09:24:31,CarShop,[購車]2023MazdaCX-520SPremiumSE車輛狀況：全新。車輛品牌：Mazd...
3319,3320,https://www.ptt.cc/bbs/CarShop/M.1675050698.A....,2023-01-30 11:51:36,CarShop,[購車]Mazda35D2023年式20SSignature/Prem車輛狀況：全新。 。車...


`artDate` 日期格式轉換

In [110]:
ptt["artDate"] = pd.to_datetime(ptt["artDate"])
ptt["artDate"] = ptt["artDate"].dt.date
ptt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptt["artDate"] = pd.to_datetime(ptt["artDate"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptt["artDate"] = ptt["artDate"].dt.date


Unnamed: 0,system_id,artUrl,artDate,artCatagory,whole_content
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH原文連結：。:要看樓主住哪或開...
1,2,https://www.ptt.cc/bbs/car/M.1606791807.A.CA7....,2020-12-01,car,[新聞]預計明年現身，ToyotaRAV4將推全新動力！原文連結：。:電動浴缸？。:防水電動...
2,3,https://www.ptt.cc/bbs/car/M.1606791901.A.39C....,2020-12-01,car,[情報]新世代Mazda2/CX-3有望沿用Yaris平台新一代Mazda 2有望直接沿用Y...
3,4,https://www.ptt.cc/bbs/car/M.1606792933.A.D92....,2020-12-01,car,Re:[情報]新世代Mazda2/CX-3有望沿用Yaris平台這是依照之前 Mazda Q...
4,5,https://www.ptt.cc/bbs/car/M.1606794935.A.AA7....,2020-12-01,car,[情報]2020年11月份臺灣汽車市場銷售報告新增小七車。:反指標板真的狂風向反著吹。:Ku...
...,...,...,...,...,...
3316,3317,https://www.ptt.cc/bbs/CarShop/M.1674922265.A....,2023-01-29,CarShop,[購車]全新MazdaCX3020SCarbonEdition車輛狀況：2023 全新。車輛...
3317,3318,https://www.ptt.cc/bbs/CarShop/M.1675004691.A....,2023-01-29,CarShop,[購車]MazdaCX-520SPremiumSE車輛狀況：全新。車輛品牌：Mazda。車款...
3318,3319,https://www.ptt.cc/bbs/CarShop/M.1675041873.A....,2023-01-30,CarShop,[購車]2023MazdaCX-520SPremiumSE車輛狀況：全新。車輛品牌：Mazd...
3319,3320,https://www.ptt.cc/bbs/CarShop/M.1675050698.A....,2023-01-30,CarShop,[購車]Mazda35D2023年式20SSignature/Prem車輛狀況：全新。 。車...


### 替代字串

In [111]:
replace = pd.read_csv('../dict/replace.csv')
replace_dict = {key: '' for key in replace['alias']}

In [112]:
def replace_str(data):
    for old, new in replace_dict.items():
        data = data.replace(old, new)
    return data

In [113]:
replace_df = ptt.copy()
replace_df['whole_content'] = replace_df['whole_content'].apply(lambda x : replace_str(x))
replace_df

Unnamed: 0,system_id,artUrl,artDate,artCatagory,whole_content
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH：。:要看樓主住哪或開車活動範...
1,2,https://www.ptt.cc/bbs/car/M.1606791807.A.CA7....,2020-12-01,car,[新聞]預計明年現身，ToyotaRAV4將推全新動力！：。:電動浴缸？。:防水電動車。:哈...
2,3,https://www.ptt.cc/bbs/car/M.1606791901.A.39C....,2020-12-01,car,[情報]新世代Mazda2/CX-3有望沿用Yaris平台新一代Mazda 2有望直接沿用Y...
3,4,https://www.ptt.cc/bbs/car/M.1606792933.A.D92....,2020-12-01,car,Re:[情報]新世代Mazda2/CX-3有望沿用Yaris平台這是依照之前 Mazda Q...
4,5,https://www.ptt.cc/bbs/car/M.1606794935.A.AA7....,2020-12-01,car,[情報]2020年11月份臺灣汽車市場銷售報告新增小七車。:反指標板真的狂風向反著吹。:Ku...
...,...,...,...,...,...
3316,3317,https://www.ptt.cc/bbs/CarShop/M.1674922265.A....,2023-01-29,CarShop,[購車]全新MazdaCX3020SCarbonEdition：2023 全新。車輛品牌：M...
3317,3318,https://www.ptt.cc/bbs/CarShop/M.1675004691.A....,2023-01-29,CarShop,[購車]MazdaCX-520SPremiumSE：全新。車輛品牌：Mazda。：CX-5 ...
3318,3319,https://www.ptt.cc/bbs/CarShop/M.1675041873.A....,2023-01-30,CarShop,[購車]2023MazdaCX-520SPremiumSE：全新。車輛品牌：Mazda。：2...
3319,3320,https://www.ptt.cc/bbs/CarShop/M.1675050698.A....,2023-01-30,CarShop,[購車]Mazda35D2023年式20SSignature/Prem：全新。 。車輛品牌：...


### 斷句

In [114]:
replace_df['whole_content'] = replace_df['whole_content'].str.split("[,，。！!？?]{1,}")



# 重新命名欄位
replace_df = replace_df.rename(columns={'whole_content': 'sentence'})
replace_df = replace_df.explode('sentence').reset_index(drop=True)

# 把長度小於1的sentence刪掉
sentence_df = replace_df[replace_df["sentence"].str.len() > 1]
sentence_df

Unnamed: 0,system_id,artUrl,artDate,artCatagory,sentence
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,[新聞]小休旅熱鬧好玩PEUGEOT30081.5LBlueH：
1,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,:要看樓主住哪或開車活動範圍在哪
2,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,西部是還好啦
3,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,:一年是可以去保養廠幾次
4,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,:3008快小改款了
...,...,...,...,...,...
196456,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,2. 詳細菜單內容
196457,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,3. 名片/Line
196458,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,:+1
196459,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,:北部+1


### 斷詞
初始化斷詞引擎

In [115]:
jieba.set_dictionary('../dict/dict.txt')
jieba.load_userdict('../dict/user_dict.txt')

Building prefix dict from d:\Projects\NSYSU\2023_BigDataAnalysis\dict\dict.txt ...
Loading model from cache C:\Users\s2568\AppData\Local\Temp\jieba.uaa528441c6063f69433245c0db13322d.cache
Loading model cost 1.109 seconds.
Prefix dict has been built successfully.


先清除標點符號及空字串

In [116]:
clear_df = sentence_df.copy()

# # 用'。'取代'\n\n'，並移除'\n'
# clear_df = clear_df.replace(r'\\n\\n', '。', regex=True).replace(r'\n', '', regex=True)
clear_df['sentence'] = clear_df['sentence'].str.replace(r'[^\w\s\d]+', '', regex=True).astype(str)
clear_df = clear_df[clear_df["sentence"].str.len() > 1]

clear_df

Unnamed: 0,system_id,artUrl,artDate,artCatagory,sentence
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,新聞小休旅熱鬧好玩PEUGEOT300815LBlueH
1,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,要看樓主住哪或開車活動範圍在哪
2,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,西部是還好啦
3,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,一年是可以去保養廠幾次
4,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,3008快小改款了
...,...,...,...,...,...
196454,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,其它需求
196455,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,1 現金空車最大折價
196456,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,2 詳細菜單內容
196457,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,3 名片Line


進行jieba斷詞

In [117]:
# 設定繁體中文詞庫
jieba.set_dictionary("../dict/dict.txt.big")

stopwords_manual = ['恭喜', '有無', '有人', '是不是', '本來', '遇到', '機車', '時間', '討論', '10', 'XD', '20', '未來', '現在', '今年']

# 新增stopwords
with open("../dict/stopwords.txt", encoding="utf-8") as f:
    stopWords = [line.strip() for line in f.readlines()]
stopWords.extend(stopwords_manual)

# 設定斷詞 function
def getToken(row):
    if not isinstance(row, str):  # 檢查類型是否為字串
        row = str(row)  # 將非字串類型轉換為字串
    seg_list = jieba.cut(row, cut_all=False)
    seg_list = [
        w for w in seg_list if w not in stopWords and len(w) > 1
    ]  # 篩選掉停用字與字元數大於1的詞彙
    return seg_list

clear_df["words"] = clear_df["sentence"].apply(getToken)
clear_df

Building prefix dict from d:\Projects\NSYSU\2023_BigDataAnalysis\dict\dict.txt.big ...
Loading model from cache C:\Users\s2568\AppData\Local\Temp\jieba.u87526c01a2c6093fa84ac3f5467b7506.cache


Loading model cost 2.602 seconds.
Prefix dict has been built successfully.


Unnamed: 0,system_id,artUrl,artDate,artCatagory,sentence,words
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,新聞小休旅熱鬧好玩PEUGEOT300815LBlueH,"[新聞, 小休, 熱鬧, 好玩, PEUGEOT300815LBlueH]"
1,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,要看樓主住哪或開車活動範圍在哪,"[樓主, 開車, 活動, 範圍]"
2,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,西部是還好啦,"[西部, 還好]"
3,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,一年是可以去保養廠幾次,"[保養, 幾次]"
4,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,3008快小改款了,"[3008, 改款]"
...,...,...,...,...,...,...
196454,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,其它需求,[需求]
196455,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,1 現金空車最大折價,"[現金, 空車, 最大, 折價]"
196456,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,2 詳細菜單內容,"[詳細, 菜單, 內容]"
196457,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,3 名片Line,"[名片, Line]"


## 情緒分析
利用`LIWC`進行情緒分析
+ sentiment 計算方式: positive - anger - anx - negative - sad

In [118]:
senti_df = clear_df.copy()

In [119]:
# 讀取情緒字典
liwc_dict = pd.read_csv("../dict/liwc/LIWC_CH.csv")
liwc_dict = liwc_dict.rename(columns={'name': 'word', "class": 'sentiments'})
liwc_dict = liwc_dict.set_index('word')['sentiments'].to_dict()
# liwc_dict

In [120]:
def get_sentiment(words, liwc_dict):
    sentiments_value = 0
    for word in words:
        if word in liwc_dict:
            if (liwc_dict[word] == "positive"):
                sentiments_value += 1
            else:
                sentiments_value -= 1
        else:
            continue
    return sentiments_value

In [121]:
# 幫每句話加上情緒分數
senti_df['sentimentValue'] = senti_df.apply(lambda row : get_sentiment(row['words'],liwc_dict), axis = 1)
senti_df

Unnamed: 0,system_id,artUrl,artDate,artCatagory,sentence,words,sentimentValue
0,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,新聞小休旅熱鬧好玩PEUGEOT300815LBlueH,"[新聞, 小休, 熱鬧, 好玩, PEUGEOT300815LBlueH]",0
1,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,要看樓主住哪或開車活動範圍在哪,"[樓主, 開車, 活動, 範圍]",0
2,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,西部是還好啦,"[西部, 還好]",0
3,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,一年是可以去保養廠幾次,"[保養, 幾次]",0
4,1,https://www.ptt.cc/bbs/car/M.1606752584.A.175....,2020-12-01,car,3008快小改款了,"[3008, 改款]",0
...,...,...,...,...,...,...,...
196454,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,其它需求,[需求],0
196455,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,1 現金空車最大折價,"[現金, 空車, 最大, 折價]",0
196456,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,2 詳細菜單內容,"[詳細, 菜單, 內容]",0
196457,3321,https://www.ptt.cc/bbs/CarShop/M.1675085457.A....,2023-01-30,CarShop,3 名片Line,"[名片, Line]",0


In [122]:
# 看一下情緒的分布
senti_df['sentimentValue'].describe()

count    184622.000000
mean          0.026389
std           0.417713
min          -3.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           4.000000
Name: sentimentValue, dtype: float64

## 儲存結果

In [123]:
senti_df.to_csv("../data/sentiment/mazda_clean_data.csv", encoding = 'utf-8',index = False)