In [1]:
import pandas as pd
import numpy as np

from langdetect import detect, DetectorFactory
import langid

# Fix langdetect's seed: the library is documented as non-deterministic unless
# DetectorFactory.seed is set, so this keeps the language labels repeatable.
DetectorFactory.seed = 42
#from transformers import pipeline

In [2]:
# Read the de-duplicated, all-language review table; the first CSV column
# becomes the row index.
review_df_all_deduplicate = pd.read_csv(
    './dataset/review_df_all_deduplicate_all_language.csv',
    index_col=0,
)

# Use langdetect

In [3]:
def detect_english_langdetect(text):
    """Return True when langdetect classifies ``text`` as English.

    Parameters
    ----------
    text : object
        Review text. Anything that is not a non-blank ``str`` (for example the
        NaN pandas uses for a missing cell) is reported as not English without
        calling the detector.

    Returns
    -------
    bool
        ``True`` only if ``detect(text)`` returns ``'en'``; ``False`` for any
        other language code and whenever detection raises an ordinary
        exception.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: a review the detector cannot handle counts as
        # non-English. Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate, so a long ``.apply`` run can be stopped.
        return False
    
# Element-wise language check; the result is a boolean Series that shares the
# frame's index (True where langdetect reports English).
mask = (
    review_df_all_deduplicate['review_text']
    .apply(detect_english_langdetect)
)

In [4]:
# Split the reviews by langdetect's verdict. Each half is materialised with
# .copy() so it is an independent frame: the English half is edited in place
# further down, and assigning into a bare .loc[mask] selection makes pandas
# emit SettingWithCopyWarning.
review_df_all_deduplicate_english = review_df_all_deduplicate.loc[mask].copy()
review_df_all_deduplicate_non_english = review_df_all_deduplicate.loc[~mask].copy()

Notice that langdetect misclassified some English reviews as non-English.

In [5]:
# Show the review texts that langdetect did not classify as English.
review_df_all_deduplicate_non_english['review_text']

14082                                love it! smells amazing
35349                                   amazing moisturizer!
37928                               best. moisturizer. ever.
53086                                   amazing moisturizer!
53176                               best. moisturizer. ever.
                                 ...                        
1043654    meilleure huile , peau sensible et rougeurs. l...
1056732    antes de usar maquillaje me gusta humectar muy...
1056888    antes de usar maquillaje me gusta humectar muy...
1064336    my skin just looks better, healthier . i’m alm...
1064791    my skin just looks better, healthier . i’m alm...
Name: review_text, Length: 942, dtype: object

# Use langid check

For the reviews that langdetect classified as English, we use langid as a second check and inspect any disagreements.

In [6]:
def detect_english_landid(text):
    """Return True when langid classifies ``text`` as English.

    Parameters
    ----------
    text : object
        Review text. Non-string values (for example the NaN pandas uses for a
        missing cell) are reported as not English without calling langid.

    Returns
    -------
    bool
        ``True`` only if the language code returned by ``langid.classify`` is
        ``'en'``; ``False`` otherwise, including when classification raises an
        ordinary exception.
    """
    if not isinstance(text, str):
        return False
    try:
        # classify() yields a (language, score) pair; only the code is used.
        language, _ = langid.classify(text)
    except Exception:
        # Best-effort: unclassifiable text counts as non-English. Unlike a
        # bare ``except:``, KeyboardInterrupt and SystemExit still propagate.
        return False
    return language == 'en'

# Second opinion on the rows langdetect accepted: True where langid also
# reports English.
test = (
    review_df_all_deduplicate_english['review_text']
    .apply(detect_english_landid)
)

In [7]:
# Compare the two detectors on the subset langdetect labelled English.
# The labels now spell the library names correctly (langid, langdetect).
print('English reviews agreed by langid: ', test.sum())
print('Total English reviews detected by langdetect: ', review_df_all_deduplicate_english.shape[0])

English reviews agreed by landid:  240687
Total English reviews detected by landdetect:  240885


Disagreement part:

[58307, 64476]: mixed English + Chinese; remove the Chinese part

[263543, 744195]: non-English review

In [8]:
# Row labels singled out after comparing langdetect with langid.
idx_mixed_language = [58307, 64476]      # English text followed by Chinese text
idx_remove = [263543, 744195, 266493]    # non-English reviews (266493 is an additional one)

# Work on an independent copy: this frame originates from a boolean .loc
# selection, and assigning into such a selection can trigger
# SettingWithCopyWarning.
review_df_all_deduplicate_english = review_df_all_deduplicate_english.copy()

# Remove everything from a non-ASCII character through the end of the text,
# then trim surrounding whitespace; this drops the trailing Chinese part.
# NOTE(review): the pattern also fires on typographic quotes or emoji that
# appear before the Chinese text - confirm these two rows contain none.
review_df_all_deduplicate_english.loc[idx_mixed_language, 'review_text'] = (
    review_df_all_deduplicate_english.loc[idx_mixed_language, 'review_text']
    .str.replace(r"[^\x00-\x7F].*$", "", regex=True)
    .str.strip()
)

# errors="ignore" tolerates labels that are absent from the English subset.
df_final_part_1 = review_df_all_deduplicate_english.drop(index=idx_remove, errors="ignore")

# Non-English Reviews Fix

To recover the English reviews that were misclassified as non-English, we use an LLM to help with detection. The details are omitted; we only import the indices of the reviews that the LLM suggested are actually English.

In [9]:
# Write the texts langdetect rejected to CSV, keeping the row index so results
# can be matched back to this frame (presumably the input to the LLM check).
(
    review_df_all_deduplicate_non_english['review_text']
    .to_csv('./dataset/ambiguous_non_english_review_text.csv', index=True, encoding="utf-8")
)

In [10]:
# Load the 'index' column of the LLM-suggested English rows as a plain Python list.
idx_english_llm_suggested = pd.read_csv('./dataset/english_indices.csv')['index'].tolist()

In [11]:
# Reload the LLM-suggested row labels so this cell starts from the file
# contents every time it is run.
idx_english_llm_suggested = pd.read_csv('./dataset/english_indices.csv')['index'].tolist()

# Hard-coded row labels to drop from the LLM's suggestions.
idx_exclude = [
    55154, 57442, 58690, 61323, 63611, 64859, 73800, 75370, 79664, 81234, 96976, 100412, 107211,
    110647, 119185, 119879, 123785, 124479, 148361, 148397, 150671, 152737, 152773, 155047, 164100, 168270,
    206021, 209342, 216052, 219320, 264115, 267064, 282563, 285346, 305041, 307558, 311974, 314457, 340072,
    342484, 344896, 348055, 350419, 353241, 355596, 455273, 456959, 585536, 586732, 593046, 580708, 581943,
    594219, 613859, 614958, 697222, 698135, 719799, 733708, 844986, 852016, 939008, 939355, 1031253, 1031587,
]

# Keep, in their original order, only the suggested labels that are not excluded.
idx_english_llm_suggested = np.array(idx_english_llm_suggested)
idx_english_llm_suggested = idx_english_llm_suggested[
    np.isin(idx_english_llm_suggested, idx_exclude, invert=True)
]

Excluded:

[55154, 57442, 58690, 61323, 63611, 64859, 73800, 75370, 79664, 81234, 96976, 100412, 107211,

110647, 119185, 119879, 123785, 124479, 148361, 148397, 150671, 152737, 152773, 155047, 164100, 168270, 

206021,  209342, 216052, 219320, 264115, 267064, 282563, 285346, 305041, 307558, 311974, 314457, 340072,

342484, 344896, 348055, 350419, 353241, 355596, 455273, 456959, 585536, 586732, 593046, 580708, 581943,

594219, 613859, 614958, 697222, 698135, 719799, 733708, 844986, 852016, 939008, 939355, 1031253, 1031587
]


hashtags, @..., repeated punctuation (e.g. like it!!!!!!!!!), pure URLs, a letter repeated within a word (e.g. loveeeeee), gibberish (e.g. jtssngdbkgiiycycitctiheckljjioooooo)

In [12]:
# Select, by row label, the langdetect-rejected reviews that remain on the
# LLM-suggested English list; .loc with a list of labels raises KeyError if a
# label is missing from the frame.
df_final_part_2 = review_df_all_deduplicate_non_english.loc[idx_english_llm_suggested]

# Concatenate Final dataframe

In [None]:
# Stack the two English subsets row-wise and save the result together with
# its row index.
df = pd.concat(
    [df_final_part_1, df_final_part_2],
)
df.to_csv(
    "./dataset/review_df_all_deduplicate_english.csv",
    index=True,
    encoding='utf-8',
)