In [19]:
# import relevant libraries

from pathlib import Path

import numpy as np
import pandas as pd

import langid
from langdetect import detect, DetectorFactory

# Fix langdetect's random seed so repeated runs give the same detections
# (langdetect is non-deterministic unless a seed is set before first use).
DetectorFactory.seed = 42

In [20]:
# Load the de-duplicated reviews (all languages); the first CSV column is the row index.
review_df_all_deduplicate = pd.read_csv(
    "./dataset/review_df_all_deduplicate_all_language.csv",
    index_col=0,
)

# Use langdetect

In [3]:
def detect_english_langdetect(text):
    """Return True if langdetect identifies ``text`` as English.

    Parameters
    ----------
    text : object
        The review text. Anything that is not a non-blank ``str`` (for
        example the NaN pandas uses for missing values) is reported as
        not English without calling the detector.

    Returns
    -------
    bool
        ``True`` when ``detect(text)`` returns ``'en'``; ``False`` otherwise,
        including when detection raises.
    """
    # Missing or blank values cannot be classified; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: langdetect raises on text it cannot extract features
        # from. Catching Exception rather than using a bare except keeps
        # Ctrl-C / kernel interrupts working during the long .apply run.
        return False
    
# Boolean Series aligned with the frame's index: True where langdetect says English.
mask = review_df_all_deduplicate.loc[:, 'review_text'].apply(detect_english_langdetect)

In [4]:
# Partition the reviews by langdetect's verdict.
review_df_all_deduplicate_english = review_df_all_deduplicate[mask]
review_df_all_deduplicate_non_english = review_df_all_deduplicate[~mask]

In [5]:
# Intermediate export of langdetect's English subset. The last cell of this
# notebook writes its final result to this same path, replacing this file.
review_df_all_deduplicate_english.to_csv("./dataset/review_df_all_deduplicate_english.csv", index = True, encoding = 'utf-8')

# A later cell reads this same path back and expects a 'label' column that
# this export does not contain, i.e. the file is labelled outside the
# notebook. Only export when the file is absent so that re-running the
# notebook cannot overwrite the labelled version.
ambiguous_path = Path("./dataset/ambiguous_non_english_review_text.csv")
if not ambiguous_path.exists():
    review_df_all_deduplicate_non_english.to_csv(ambiguous_path, index = True, encoding = 'utf-8')

Notice that langdetect misclassified some English reviews as non-English.

In [6]:
# Inspect the review texts that langdetect flagged as non-English.
review_df_all_deduplicate_non_english.loc[:, 'review_text']

34207                          My favorite moisturizer ever!
34378                                   absolute love cream.
34497        Does soak in well, but doesn’t give a dewy glow
34695      Es simplemente maravillosa!!!! Super recomendada.
34769      Super hydrating but doesn’t make your skin oil...
                                 ...                        
1031570    J’adore ce produit pour ma peau déshydratée! M...
1031587    J’adore ce produit, il hydrate et affine le gr...
1043654    Meilleure huile , peau sensible et rougeurs. L...
1056732    Antes de usar maquillaje me gusta humectar muy...
1056888    Antes de usar maquillaje me gusta humectar muy...
Name: review_text, Length: 3527, dtype: object

We use an LLM to help classify the ambiguous reviews. The details are omitted; here we directly import the processed data.

In [21]:
# Load the labelled version of the ambiguous (langdetect non-English) reviews.
review_df_all_deduplicate_non_english = pd.read_csv(
    './dataset/ambiguous_non_english_review_text.csv',
    encoding='latin1',
    index_col=0,
)

# Index labels of the ambiguous rows whose label is exactly 'English-only'.
english_idx = review_df_all_deduplicate_non_english.loc[
    lambda frame: frame['label'] == 'English-only'
].index

# Use langid check

For the English reviews detected by langdetect, we use langid to double-check for any discrepancies.

In [7]:
def detect_english_landid(text):
    """Return True if langid classifies ``text`` as English.

    Parameters
    ----------
    text : object
        The review text. Values that are neither ``str`` nor ``bytes``
        (for example the NaN pandas uses for missing values) are reported
        as not English without calling langid.

    Returns
    -------
    bool
        ``True`` when the language returned by ``langid.classify(text)`` is
        ``'en'``; ``False`` otherwise, including when classification raises.
    """
    # Non-text values cannot be classified; skip the classifier call.
    if not isinstance(text, (str, bytes)):
        return False
    try:
        # classify() returns a (language, score) pair; only the language is used.
        language, _ = langid.classify(text)
    except Exception:
        # Best-effort: treat any classifier failure as "not English".
        # Catching Exception rather than using a bare except keeps
        # Ctrl-C / kernel interrupts working during the long .apply run.
        return False
    return language == 'en'

# Second opinion: True where langid also classifies a langdetect-English review as English.
test = review_df_all_deduplicate_english.loc[:, 'review_text'].apply(detect_english_landid)

In [8]:
# Compare how many langdetect-English reviews langid also accepts against the total.
print('English reviews agreed by landid: ', test.sum())
print('Total English reviews detected by landdetect: ', len(review_df_all_deduplicate_english))

English reviews agreed by landid:  886979
Total English reviews detected by landdetect:  887749


In [13]:
# Reviews where langid disagrees with langdetect. A later cell reads this same
# path back and expects a 'label' column that this export does not contain,
# i.e. the file is labelled outside the notebook. Only export when the file is
# absent so that re-running the notebook cannot overwrite the labelled version.
disagreement_path = Path("./dataset/disagreement_part.csv")
if not disagreement_path.exists():
    review_df_all_deduplicate_english.loc[~test, 'review_text'].to_csv(disagreement_path, index = True, encoding = 'utf-8')

In [14]:
# Export the review texts on which langid and langdetect agree.
(
    review_df_all_deduplicate_english['review_text']
    .loc[test]
    .to_csv("./dataset/agreement_part.csv", index=True, encoding='utf-8')
)

In [39]:
# Read both halves of the langid cross-check back from disk.
review_df_all_deduplicate_english_agree = pd.read_csv(
    "./dataset/agreement_part.csv",
    index_col=0,
)
review_df_all_deduplicate_english_disagree = pd.read_csv(
    "./dataset/disagreement_part.csv",
    encoding='latin1',
    index_col=0,
)

# Disagreement rows are kept only when their label is exactly 'English-only'.
english_idx_1 = review_df_all_deduplicate_english_disagree.loc[
    lambda frame: frame['label'] == 'English-only'
].index
# Every agreement row is kept.
english_idx_2 = review_df_all_deduplicate_english_agree.index

In [51]:
# Stack the three sets of accepted index labels, in the order they were derived.
english_idx_all = np.concatenate(
    [idx.values for idx in (english_idx, english_idx_1, english_idx_2)]
)

# Store Final dataframe

In [55]:
# Select the accepted rows (all columns) from the full frame by index label.
review_df_all_deduplicate_new = review_df_all_deduplicate.loc[english_idx_all, :]

In [58]:
# Write the final English-only frame, keeping the original index column.
review_df_all_deduplicate_new.to_csv(
    "./dataset/review_df_all_deduplicate_english.csv",
    index=True,
    encoding='utf-8',
)