# Language Detection Lab

In [16]:
import numpy as np
from collections import Counter
import pandas as pd

from lingua import Language, LanguageDetectorBuilder
from genailab.infra.utils.file.io import IOService
from pandarallel import pandarallel

# Set up pandarallel before any `parallel_apply` call is made further down.
# NOTE(review): the worker count (12) is hard-coded — confirm it suits the host machine.
pandarallel.initialize(nb_workers=12, verbose=False)

In [17]:
# Candidate languages handed to lingua: only English and Spanish.
# NOTE(review): presumably the detector can then only answer with one of these
# two languages (or no language at all) — confirm against the lingua docs.
languages = [Language.ENGLISH, Language.SPANISH]
# Shared detector instance used by the re-evaluation helper below.
detector = LanguageDetectorBuilder.from_languages(*languages).build()

In [18]:
# Path of the parquet file holding the DQA-annotated review dataset.
fp = "workspace/dev/dataset/01_dataprep/appvocai_discover-01_dataprep-02_dqa-review-dataset.parquet"
# Names of the two "non-English" flag columns (app name and review).
col1 = "dqa_non_english_app_name"
col2 = "dqa_non_english_review"
# `cols2` was the original (plural-looking) name for this single column; it is
# kept as an alias so any existing reference keeps working.
cols2 = col2

In [19]:
# Read the dataset located at `fp` into a DataFrame.
# NOTE(review): the output saved with this cell is
# "ArrowInvalid: Cannot yet unify dictionaries with nulls", i.e. the read
# failed in that run — confirm the file loads on a fresh kernel.
df = IOService.read(fp)
# Number of rows whose app name is flagged as non-English.
df[col1].sum()

ArrowInvalid: Cannot yet unify dictionaries with nulls

## Lingua

In [5]:
def re_evaluate(text):
    """Return True when lingua does not classify ``text`` as English.

    The module-level ``detector`` is asked for the language of ``text`` and
    the answer is compared with ``Language.ENGLISH``. Any answer other than
    ``Language.ENGLISH`` yields True.
    NOTE(review): that presumably includes the "no language detected" case —
    confirm against the lingua docs.

    Args:
        text: The value to classify (an app name in this notebook).

    Returns:
        bool: True if the detected language differs from English; False if it
        is English or if detection raised an exception.
    """
    try:
        detected = detector.detect_language_of(text)
        is_non_english = detected != Language.ENGLISH
    except Exception as e:
        # Best effort: report the problem and treat the row as English.
        print(f"Error in re-evaluation: {e}")
        is_non_english = False
    return is_non_english

In [6]:
# Every column whose name starts with "dqa" is treated as a data-quality flag.
cols = [name for name in df.columns if name.startswith("dqa")]
# Per-flag totals (number of flagged rows in each column).
df.loc[:, cols].sum()

dqa_entropy                     193
dqa_duplicate_rows                0
dqa_duplicate_id                  0
dqa_duplicate_review           3282
dqa_has_null                      0
dqa_invalid_rating                0
dqa_non_english_review         2946
dqa_non_english_app_name       8989
dqa_has_emoji                  3490
dqa_excessive_special_chars     809
dqa_invalid_date                  0
dqa_has_profanity               573
dqa_contains_email                0
dqa_contains_url                  0
dqa_contains_phone_number        26
dtype: int64

In [18]:
# Re-check only the rows whose app name is currently flagged as non-English
# (df[col1] is True) and overwrite that flag with the result of re_evaluate.
flagged = df[col1]
# Nothing to re-check when no row is flagged, so skip the parallel call.
if flagged.any():
    # re_evaluate is passed directly; wrapping it in a lambda adds nothing.
    df.loc[flagged, col1] = df.loc[flagged, "app_name"].parallel_apply(re_evaluate)

In [19]:
# Show the app names that are still flagged as non-English.
df.loc[df[col1], "app_name"]

3          Golden Quran | المصحف الذهبي
18       قرآن هادی با ترجمه تفسیر فارسی
40           Аудиокниги - Без Интернета
44                             Prologue
55       قرآن هادی با ترجمه تفسیر فارسی
                      ...              
58833                            Gosund
58913                          小牛加速器vpn
58921                           Bitmoji
58944                  Mobily - موبايلي
59001      Carteira Digital de Trânsito
Name: app_name, Length: 1206, dtype: object

In [23]:
# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; a bare `df.head()` here is discarded.
display(df.head())
# Column names, non-null counts and dtypes.
df.info()
# Number of rows whose app name is currently flagged as non-English.
df[col1].sum()

<class 'pandas.core.frame.DataFrame'>
Index: 59021 entries, 0 to 59020
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   id                           59021 non-null  int64   
 1   app_id                       59021 non-null  int64   
 2   app_name                     59021 non-null  object  
 3   category_id                  59021 non-null  int64   
 4   author                       59021 non-null  object  
 5   rating                       59021 non-null  int64   
 6   content                      59021 non-null  object  
 7   vote_sum                     59021 non-null  int64   
 8   vote_count                   59021 non-null  int64   
 9   date                         59021 non-null  object  
 10  dqa_entropy                  59021 non-null  bool    
 11  dqa_duplicate_rows           59021 non-null  bool    
 12  dqa_duplicate_id             59021 non-null  bool    
 13  dqa_du

1206

In [15]:
# Count the distinct values in the `id` column; the recorded result (59021)
# equals the number of index entries reported by df.info().
df["id"].nunique()

59021

In [21]:
# Data-quality flag columns: every column whose name starts with "dqa".
cols = [name for name in df.columns if name.startswith("dqa")]
# Flagged-row count per column, computed once and reused for both columns.
flag_counts = df[cols].sum(axis=0)
# Summary table: absolute count ("n") and share of all rows in percent ("%").
s = flag_counts.to_frame(name="n")
s["%"] = (flag_counts / df.shape[0] * 100).round(2)
s

Unnamed: 0,n,%
dqa_entropy,193,0.33
dqa_duplicate_rows,0,0.0
dqa_duplicate_id,0,0.0
dqa_duplicate_review,3282,5.56
dqa_has_null,0,0.0
dqa_invalid_rating,0,0.0
dqa_non_english_review,2946,4.99
dqa_non_english_app_name,1206,2.04
dqa_has_emoji,3490,5.91
dqa_excessive_special_chars,809,1.37


In [22]:
# Per-flag totals for the "dqa" columns selected in `cols`.
df.loc[:, cols].sum()

dqa_entropy                     193
dqa_duplicate_rows                0
dqa_duplicate_id                  0
dqa_duplicate_review           3282
dqa_has_null                      0
dqa_invalid_rating                0
dqa_non_english_review         2946
dqa_non_english_app_name       1206
dqa_has_emoji                  3490
dqa_excessive_special_chars     809
dqa_invalid_date                  0
dqa_has_profanity               573
dqa_contains_email                0
dqa_contains_url                  0
dqa_contains_phone_number        26
dtype: int64