### 此檔案的前處理主要包含兩個部分
#### 1.篩選出同一用戶至少有六筆以上評論的所有資料 select_over6_review
#### 2.針對review的內容作資料前處理

In [None]:
# List every file with the target extension in the reviews folder and keep
# only the base name (extension removed).
# This cell is only needed after re-crawling the Google Play data; otherwise
# it does not have to be re-run.

import os

fileDir = r"./Data/games_reviews"
fileExt = r".csv"
filelist = [name for name in os.listdir(fileDir) if name.endswith(fileExt)]

# Strip the extension from every matching file name.
filenamelist = [os.path.splitext(name)[0] for name in filelist]

filenamelist

In [None]:
# Count how many reviews each user has written across all per-app CSV files.
# Users are identified by the value of the 'userImage' column.

from collections import Counter

import pandas as pd
import numpy as np
import os
from time import sleep
from tqdm import tqdm, trange

fileDir = './Data/games_reviews'
fileExt = '.csv'
filelist = [name for name in os.listdir(fileDir) if name.endswith(fileExt)]

# Counter is a dict subclass, so later cells can keep using
# .items() / .pop() / len() on it exactly as with a plain dict.
dic_userID = Counter()

for file in tqdm(filelist):
    df = pd.read_csv(os.path.join(fileDir, file))
    # One increment per review row in this file.
    dic_userID.update(df['userImage'].tolist())

In [None]:
# Collect every user with fewer than 6 reviews. Reviews made with the default
# profile picture (the URL below) are removed as well, regardless of count.
default_image_url = 'https://play-lh.googleusercontent.com/EGemoI2NTXmTsBVtJqk8jxF9rh8ApRWfsIMQSt2uE4OcpQqbFu7f7NbTK05lx80nuSijCz7sc3a277R67g'
less_6_review = [default_image_url]
less_6_review.extend(
    key
    for key, value in dic_userID.items()
    # Skip the default URL here so it is never listed twice.
    if value < 6 and key != default_image_url
)

print(len(dic_userID))

# Remove the collected users from the counter.
# pop(k, None) keeps this safe when the default URL never occurred in the data
# (a bare pop(k) would raise KeyError in that case).
for k in less_6_review:
    dic_userID.pop(k, None)
print(len(dic_userID))  # size after removal

In [None]:
# Order the users from most to fewest reviews:
# take the (userImage, count) pairs of the dict and sort them by count.
list_sorted = list(dic_userID.items())
list_sorted.sort(key=lambda entry: entry[1], reverse=True)

In [None]:
# Write the per-user review counts to CSV.
# userID_index.csv is too large to open; userID_index_over6.csv records the
# users that remain after the filter above (at least 6 reviews).
df = pd.DataFrame(data=list_sorted, columns=['userImage', 'times'])
with open("./Data/userID_index_over6.csv", mode='w', encoding='utf-8', newline='') as f:
    df.to_csv(f, index=True, encoding='utf-8')

# mode='w' truncates the file before writing; use mode='a' to append instead.

In [None]:
# Load the saved list of users that have at least 6 reviews.
over6_csv_path = "./Data/userID_index_over6.csv"
df2 = pd.read_csv(over6_csv_path)
df2.head()

In [None]:
# Gather, from every per-app CSV, the reviews written by users listed in df2,
# and tag each row with a fresh appIndex / appId.
fileDir = './Data/games_reviews'
fileExt = '.csv'
filelist = [name for name in os.listdir(fileDir) if name.endswith(fileExt)]

# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
frames = []

# NOTE(review): appIndex follows the order returned by os.listdir, which is
# not guaranteed to be stable across runs/machines - confirm this is acceptable.
for i, file in enumerate(tqdm(filelist)):

    df = pd.read_csv(os.path.join(fileDir, file))

    # Keep only the reviews written by the users listed in df2.
    df = df[df["userImage"].isin(df2['userImage'])]

    # Some CSVs already contain these columns and some do not. Drop whichever
    # of them exist so the inserts below never hit an existing column.
    df = df.drop(columns=['appIndex', 'appId'], errors='ignore')

    # The app id is the file name without its extension.
    appId = os.path.splitext(file)[0]

    df.insert(df.shape[1], 'appIndex', i)
    df.insert(df.shape[1], 'appId', appId)

    frames.append(df)

# ignore_index=True gives the combined frame a unique 0..n-1 index; keeping
# the per-file indexes would repeat labels, so new_df.loc[0] would return
# one row per file instead of a single review.
new_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Number of rows, followed by the review text stored under index label 0.
print(len(new_df))
print(new_df.loc[0, 'content'])

In [None]:
# Remove rows where the rating or the review is missing
# (implemented by dropping every row that has a missing value in any column).
print(new_df.shape)

new_df = new_df.dropna(axis=0, how='any')
print(new_df.shape)

In [None]:
# Check for fully duplicated rows and remove them before saving.
# duplicated() / drop_duplicates() return new objects, so their results must
# be used; calling them without assignment leaves new_df unchanged.
duplicate_count = int(new_df.duplicated().sum())
print('duplicated rows: ', duplicate_count)

new_df = new_df.drop_duplicates()
new_df.to_csv('./Data/allreview.csv', index=False, encoding='utf-8')

# allreview.csv: all crawled reviews written by users with at least 6 reviews,
# with appId and appIndex re-assigned.

#### 以下針對review內容進行前處理

In [None]:
import pandas as pd
import os
import re
from itertools import chain

# -*- coding: utf-8 -*-
from time import sleep
from tqdm import tqdm, trange

# Patterns are compiled once at import time instead of on every call.
TAG_RE = re.compile(r'<[^>]+>')
_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9]")
_SINGLE_LETTER_RE = re.compile(r"\s+[a-zA-Z]\s+")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(sen):
    """Clean one review text and return the cleaned string.

    Steps, in order:
      1. remove HTML-like tags (``<...>``);
      2. replace every character that is not an ASCII letter or digit
         with a space;
      3. replace a single letter that has whitespace on both sides with one
         space (matches do not overlap, so of two adjacent single letters
         only the first is removed);
      4. collapse every run of whitespace into one space;
      5. replace the literal ``"sssss "`` with one space.

    The result is not stripped, so it may start or end with a space.

    The earlier version also tried to pad contractions (``'s``, ``n't``, ...)
    and punctuation (``, ! ( ) ?``) with spaces after step 4. Step 2 has
    already removed all of those characters, so those substitutions could
    never match; they were dropped without changing the output.

    Args:
        sen: the raw review text (``str``).

    Returns:
        The cleaned text.
    """
    sentence = TAG_RE.sub('', sen)
    sentence = _NON_ALNUM_RE.sub(' ', sentence)
    sentence = _SINGLE_LETTER_RE.sub(' ', sentence)
    sentence = _WHITESPACE_RE.sub(' ', sentence)
    # Kept last, exactly as before: it can re-introduce a double space
    # (e.g. "a sssss b" -> "a  b").
    return sentence.replace("sssss ", " ")

# Clean each review, split it into tokens, and report the reviews that end up empty.
def findErrorReview(new_df):
    """Return the index labels of rows whose review has no tokens left.

    Each value of the ``content`` column is cleaned with ``preprocess_text``
    and split with ``tokenizer``; a row is reported when the token list is
    empty.

    The rows are visited through the column's own index labels rather than
    ``range(len(new_df))``, so the function also works when the index has
    gaps (for example after ``dropna``), and the returned labels can be
    passed directly to ``DataFrame.drop(index=...)``.

    Args:
        new_df: DataFrame with a ``content`` column holding review text.

    Returns:
        list of index labels of the rows to drop.
    """
    need_drop = []
    contents = new_df['content']
    for label, content in tqdm(contents.items(), total=len(contents)):
        # Remove unwanted characters, then split on spaces.
        token = tokenizer(preprocess_text(content))

        # Nothing is left of the review after cleaning: record the row.
        if not token:
            need_drop.append(label)
    return need_drop

# Check whether a string has any non-whitespace content
def not_empty(s):
    """Return a truthy value only if ``s`` contains non-whitespace text.

    A falsy ``s`` (``None`` or ``""``) is returned unchanged; otherwise the
    stripped string is returned, which is ``""`` for whitespace-only input.
    """
    if not s:
        return s
    return s.strip()

# Tokenize
def tokenizer(text):
    """Split ``text`` on single spaces and return the lower-cased tokens.

    Pieces that are empty or whitespace-only (as judged by ``not_empty``)
    are left out of the result.
    """
    # Simple space-based splitting, lower-casing each piece.
    lowered = (piece.lower() for piece in text.split(' '))
    # Discard empty / blank pieces.
    return [piece for piece in lowered if not_empty(piece)]
    
# Find the reviews that become empty after cleaning and drop those rows.
errorReviewIndex = findErrorReview(new_df)
ALL_ORIGIN_DATA = new_df.drop(labels=errorReviewIndex, axis=0)
# ALL_ORIGIN_DATA.to_csv('./Data/ALL_ORIGIN_DATA.csv',index=False, encoding='utf-8') 

In [None]:
# Show the shape and the first rows of the cleaned data, one per line.
print(ALL_ORIGIN_DATA.shape, ALL_ORIGIN_DATA.head(), sep='\n')

#### 上面處理完評論後，可能額外出現一些空值，或使用戶評論數量不足6，因此以下重複檢查一次

In [2]:
# Recount the reviews per user on the cleaned data.
from collections import Counter

# Counter is a dict subclass, so the following cells can keep using .items().
dic_userID = Counter(ALL_ORIGIN_DATA['userImage'].tolist())

In [None]:
# Users that are left with fewer than 6 reviews.
less_6_review = [url for url, n_reviews in dic_userID.items() if n_reviews < 6]

print(less_6_review[:10])
print('ALL_ORIGIN_DATA before: ',ALL_ORIGIN_DATA.shape)

# Work on a copy of ALL_ORIGIN_DATA.
new_df4 = ALL_ORIGIN_DATA.copy()

# Drop the reviews of those users:
# `~` negates the mask, selecting the rows whose user is NOT in less_6_review.
has_too_few = new_df4['userImage'].isin(less_6_review)
new_df4 = new_df4[~has_too_few]
print('ALL_ORIGIN_DATA after: ',new_df4.shape)

In [9]:
print(len(less_6_review))

# Drop every row that has a missing value in any column.
ALL_ORIGIN_DATA = new_df4.dropna(axis=0, how='any')
print(ALL_ORIGIN_DATA.shape)

(1116196, 9)
(1116196, 9)


In [None]:
# 以下包含
# 1 用來檢查相關數據的量
# 2 加入＂userID的欄位＂到 ALL_ORIGIN_DATA 當中
# 3 存回 ALL_ORIGIN_DATA.csv

In [10]:
from collections import Counter
from tqdm import tqdm

dic_userID = Counter()  # reviews per user (keyed by userImage)
dic_appID = Counter()   # reviews per app (keyed by appId)

userId = []      # column to add to ALL_ORIGIN_DATA
user_index = {}  # userImage -> integer user id, numbered from 0 in order of first appearance

# The progress bar advances once per row of ALL_ORIGIN_DATA.
for image_url, appId in tqdm(ALL_ORIGIN_DATA[['userImage', 'appId']].values,
                             total=len(ALL_ORIGIN_DATA)):
    dic_userID[image_url] += 1
    dic_appID[appId] += 1
    # A dict lookup replaces list.index(), which scanned the whole list of
    # known users for every row and made this cell quadratic.
    userId.append(user_index.setdefault(image_url, len(user_index)))

# Kept for compatibility with the previous version of this cell:
user_li = list(user_index)  # userImage values; the position in the list is the user id
count = len(user_li)        # number of distinct users

print('總共幾人: ', len(dic_userID))
print('總共幾個APP: ', len(dic_appID))

ALL_ORIGIN_DATA['userId'] = userId
if userId:  # max() raises ValueError on an empty list
    print(max(userId))
print(ALL_ORIGIN_DATA['userId'].head(80))

100%|███████████████████████████████████████████████████████████████████████| 1116196/1116196 [23:57<00:00, 776.49it/s]


總共幾人:  135331
總共幾個APP:  9095
135330
0      0
1      1
2      2
3      3
4      4
      ..
75    75
76    76
77    77
78    78
79    79
Name: userId, Length: 80, dtype: int64


In [11]:
# ALL_ORIGIN_DATA.csv: the final data set used before feeding the different
# embeddings; it holds all original columns plus the added index columns.

# index=True also writes the DataFrame index as the first CSV column
# (presumably what serves as the review index - confirm with downstream readers).
all_origin_path = './Data/ALL_ORIGIN_DATA.csv'
ALL_ORIGIN_DATA.to_csv(all_origin_path, index=True, encoding='utf-8')
ALL_ORIGIN_DATA

Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,appIndex,appId,userId
0,0,gp:AOqpTOGU42bo09lr6HIbEafb--V92oid3mlF7GbWTse...,charles archer,https://play-lh.googleusercontent.com/a-/AOh14...,Now I understand why so many low marks. Two ob...,1,5,aceviral.dragoncraft,0
1,1,gp:AOqpTOHaUdaHO8WWBNQ6zWy6It4mnQbahieXEUftnx7...,Jeremy English,https://play-lh.googleusercontent.com/a-/AOh14...,Its good but the adds to get more blocks wont ...,3,5,aceviral.dragoncraft,1
2,2,gp:AOqpTOHcu9AlSJzpEFryQhqoKLTngO1xRKsEOnDG4bu...,Wendy Barber,https://play-lh.googleusercontent.com/a-/AOh14...,I love the game my lil sis showed me it and iv...,5,5,aceviral.dragoncraft,2
3,3,gp:AOqpTOGZmYcOx0xGYjd7xSfIkvgrFo3KQHl3YEbql5G...,Terence Scott,https://play-lh.googleusercontent.com/a-/AOh14...,Good game. It deserves 5 stars because it isn'...,5,5,aceviral.dragoncraft,3
4,4,gp:AOqpTOH9ZmIq1ZaffiRxmr7hQSrefhxoVArLms6tq8L...,Kathey Whismendel,https://play-lh.googleusercontent.com/a/AATXAJ...,"Dear people, I lovvvvvvvve it but why do u hav...",5,5,aceviral.dragoncraft,4
...,...,...,...,...,...,...,...,...,...
1116191,1120733,gp:AOqpTOEtTZcqJDoWNVkbM5eb6u2uOFpkVf4nmtFxcCn...,Carlos Barreto,https://play-lh.googleusercontent.com/a/AATXAJ...,Useful features keep being removed. Now it was...,1,9684,_us_com.instagram.android,47487
1116192,1120734,gp:AOqpTOHHEbsM2cBfSax3UW6BPzNyFRYkmpqOBSmM2Fi...,Kira Shagrath,https://play-lh.googleusercontent.com/a-/AOh14...,Algorithm makes it impossible to grow your acc...,3,9684,_us_com.instagram.android,14297
1116193,1120735,gp:AOqpTOFHnUWsEvmvcvhcnDgVMOfe1KDmGWSw6Kb75Ve...,Audrey Wilkins,https://play-lh.googleusercontent.com/a/AATXAJ...,"This app is a lot of fun..I am new to it, but ...",5,9684,_us_com.instagram.android,38611
1116194,1120736,gp:AOqpTOGMczJFE1NFAlDAZVmnS-ciiyMhu27kTOsob_V...,Emmeric Conlan Reviews,https://play-lh.googleusercontent.com/a-/AOh14...,It's been an awesome app to use! As someone wh...,5,9684,_us_com.instagram.android,6009


#### 以下從最終的完整資料集 ALL_ORIGIN_DATA.csv 取出特定欄位，
#### 組成方便後續使用的幾個子檔案(便於加速後續檔案讀取)

In [12]:
# ALL_ORIGIN_DATA_all.csv    : index / review / rating / appID / userID
# ALL_ORIGIN_DATA_review.csv : index / review / appID / userID
# ALL_ORIGIN_DATA_rating.csv : index / rating / appID / userID

# Build the sub-files from the final data set itself. The previous version
# read from `df`, which in this notebook is only bound to a per-file frame or
# to the user-count table, neither of which has a 'userId' column.
# NOTE(review): if `df` was meant to be ALL_ORIGIN_DATA.csv re-read from disk,
# the result below is the same - confirm.
#
# The index is reset first so review_index is the 0..n-1 position in the
# final data set. Assigning Series column by column aligns on index labels,
# so an index with gaps would otherwise misalign the rows.
source_df = ALL_ORIGIN_DATA.reset_index(drop=True)

all_columns = pd.DataFrame({
    'review_index': source_df.index,
    'userId': source_df['userId'],
    # The 'appId' column of the sub-files holds the numeric appIndex.
    'appId': source_df['appIndex'],
    'rating': source_df['score'],
    'review': source_df['content'],
})

df1 = all_columns[['review_index', 'userId', 'appId', 'rating', 'review']].copy()
df1.to_csv('./Data/ALL_ORIGIN_DATA_all.csv', index=True, encoding='utf-8')

df2 = all_columns[['review_index', 'userId', 'appId', 'review']].copy()
df2.to_csv('./Data/ALL_ORIGIN_DATA_review.csv', index=True, encoding='utf-8')

df3 = all_columns[['review_index', 'userId', 'appId', 'rating']].copy()
df3.to_csv('./Data/ALL_ORIGIN_DATA_rating.csv', index=True, encoding='utf-8')

df3

Unnamed: 0,review_index,userId,appId,rating
0,0,0,5,1
1,1,1,5,3
2,2,2,5,5
3,3,3,5,5
4,4,4,5,5
...,...,...,...,...
1116191,1116191,47487,9684,1
1116192,1116192,14297,9684,3
1116193,1116193,38611,9684,5
1116194,1116194,6009,9684,5
