# Text Quality Assessment

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from discover.container import DiscoverContainer
from discover.assets.idgen import AssetIDGen
from discover.core.flow import DataPrepStageDef, PhaseDef

pd.options.display.max_rows = 999

In [2]:
PPL_FULL = 4.673929214477539
FP_WEIGHTS = "notes/tqa_weights.csv"

In [3]:
container = DiscoverContainer()
container.init_resources()
container.wire(
    modules=[
        "discover.flow.data_prep.stage",
        "discover.app.base",
    ],
)

## Compute Weights

In [4]:
tqa_weights = pd.read_csv(FP_WEIGHTS)
tqa_weights

Unnamed: 0.1,Unnamed: 0,Filter,Perplexity
0,0,tqf_has_adjective,4.546495
1,1,tqf_has_adverb,4.583549
2,2,tqf_has_determiner,4.462878
3,3,tqf_has_noun,4.512285
4,4,tqf_has_terminal_punctuation,4.651699
5,5,tqf_has_verb,4.389349
6,6,tqf_high_punctuation_ratio,5.892704
7,7,tqf_word_count_range,4.521953
8,8,tqf_stop_word_match,4.549662
9,9,tqf_first_letter_cap,4.761631


In [None]:
tqa_weights["Weight"] = tqa_weights["Perplexity"].apply(
    lambda x: max(0.0, ((PPL_FULL - x) / PPL_FULL))
)
tqa_weights

Unnamed: 0.1,Unnamed: 0,Filter,Perplexity,Weight
0,0,tqf_has_adjective,4.546495,0.027265
1,1,tqf_has_adverb,4.583549,0.019337
2,2,tqf_has_determiner,4.462878,0.045155
3,3,tqf_has_noun,4.512285,0.034584
4,4,tqf_has_terminal_punctuation,4.651699,0.004756
5,5,tqf_has_verb,4.389349,0.060887
6,6,tqf_high_punctuation_ratio,5.892704,0.0
7,7,tqf_word_count_range,4.521953,0.032516
8,8,tqf_stop_word_match,4.549662,0.026587
9,9,tqf_first_letter_cap,4.761631,0.0


## Load Dataset

In [None]:
idgen = AssetIDGen()
asset_id = idgen.get_asset_id(
    asset_type="dataset",
    phase=PhaseDef.DATAPREP,
    stage=DataPrepStageDef.TQA,
    name="review",
)
# Instantiate the repository
repo = container.repo.dataset_repo()
# Load the dataset from the repository
dataset = repo.get(asset_id, distributed=False)
# Filter tqa_score outliers
df = dataset.content.loc[dataset.content["tqa_score"] < 5]
print(f"Dataset has {len(df)} observations.")

Dataset has 55691 observations.


## Get Filter Columns

In [7]:
filters = df.columns[df.columns.str.contains("tqf")].tolist()
filters = [filter for filter in filters if filter != "tqf_high_digit_ratio"]
filters

['tqf_has_adjective',
 'tqf_has_adverb',
 'tqf_has_determiner',
 'tqf_has_noun',
 'tqf_has_terminal_punctuation',
 'tqf_has_verb',
 'tqf_high_punctuation_ratio',
 'tqf_word_count_range',
 'tqf_stop_word_match',
 'tqf_first_letter_cap',
 'tqf_no_all_caps',
 'tqf_high_word_repetition',
 'tqf_no_special_chars']

## Compute Scores

In [None]:
flags = df[filters].to_numpy()
np.shape(flags)
scores = flags.dot(tqa_weights["Weight"])
df["tqa_scores2"] = scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tqa_scores2'] = scores


In [9]:
df.head().T

Unnamed: 0,0,1,2,3,4
app_id,521227008,466446054,1492654676,1248062869,1076402606
id,6820417815,6821034342,6824778204,6826980171,6828395114
app_name,Audiobooks.com: Get audiobooks,cloudLibrary by bibliotheca,POCKET COMICS: Premium Webtoon,Lure: Interactive Chat Stories,"Libby, by OverDrive"
category_id,6018,6018,6018,6018,6018
author,d1e8eb407847e0422c7b,d2b8d2519dbd0a3d9e76,92be615c4beec31b0db6,62b165e509cd67d2b2f3,6ca53a438d4c9ff67a79
rating,4,5,3,1,5
content,Ive been looking to get listen to audiobooks a...,The interface isnt as slick as kindle or audib...,I love the app and the fact that on good stori...,Hi the app is amazing and stories are really g...,I love how easy it is to borrow books using th...
vote_sum,0,0,0,0,0
vote_count,0,0,0,0,0
date,2021-01-02 04:17:00,2021-01-02 07:47:00,2021-01-03 05:46:00,2021-01-03 18:37:00,2021-01-04 03:45:00


## Correlation Between Scores

In [None]:
df.loc[df["stats_special_chars_proportion"] > 0.2][
    ["content", "stats_special_chars_proportion"]
]

Unnamed: 0,content,stats_special_chars_proportion
424,oh.... our table.... is broken!,0.28125
1407,Great book!!!,0.230769
1641,I LOVE DIS APP!!!!,0.222222
1642,Let us buy books on Amazon!!!!!!!!!!,0.277778
1955,Great!!,0.285714
3057,Always clear.... Always works!!!,0.21875
4299,I dont like when my time disappears!!!!!!!!!!!,0.23913
5174,Love it!!,0.222222
6290,Love it!!!,0.3
6492,AWESOME!!,0.222222


In [None]:
s1 = df["tqa_score"]
s2 = df["tqa_scores2"]
print(f"The correlation between s1 and s2 is: {s1.corr(s2)}")
print(s1.describe())
s2.describe()

The correlation between s1 and s2 is: 0.7063475218195396
count    55691.000000
mean         0.483371
std          0.101457
min          0.000000
25%          0.449221
50%          0.503377
75%          0.544764
max          2.631612
Name: tqa_score, dtype: float64


count    55691.000000
mean         0.205710
std          0.073945
min          0.000000
25%          0.173911
50%          0.231750
75%          0.251087
max          0.287609
Name: tqa_scores2, dtype: float64