In [1]:
from typing import List, Set
import Constants as Const
import DataLoader
import spacy
import ModelLoader
from tqdm import tqdm

# Shared NLP pipeline used by the cells below (called as ``nlp(text)``).
nlp = ModelLoader.get_spacy_model()


def _load_text_column(dataset, column):
    """Load a compressed dataset and return one of its columns as a list."""
    return DataLoader.load_df_compressed(dataset)[column].tolist()


# Issue texts per app. Firefox and VLC are read from the 'summary' column,
# Signal and Nextcloud from the 'title' column.
firefox_issue_texts = _load_text_column(Const.FIREFOX_ISSUES_ANDROID, 'summary')
vlc_issue_texts = _load_text_column(Const.VLC_ISSUES, 'summary')
signal_issue_texts = _load_text_column(Const.SIGNAL_ISSUES, 'title')
nextcloud_issue_texts = _load_text_column(Const.NEXTCLOUD_ISSUES, 'title')

# All app reviews live in one frame; the 'app_id' column tells the apps apart.
reviews = DataLoader.load_df_compressed(Const.APP_REVIEWS)


def _review_texts_for(app_id):
    """Return the 'content' of every review whose 'app_id' equals ``app_id``."""
    belongs_to_app = reviews['app_id'] == app_id
    return reviews[belongs_to_app]['content'].tolist()


firefox_reviews = _review_texts_for(Const.FIREFOX_ID)
vlc_reviews = _review_texts_for(Const.VLC_ID)
signal_reviews = _review_texts_for(Const.SIGNAL_ID)
nextcloud_reviews = _review_texts_for(Const.NEXTCLOUD_ID)

In [2]:
def text_to_nouns(texts: List[str]) -> Set[str]:
    """Collect the distinct lower-cased nouns occurring in ``texts``.

    Each text is run through the module-level ``nlp`` pipeline. Every token
    whose ``pos_`` equals ``'NOUN'`` contributes its lower-cased ``text``;
    tokens with any other tag are ignored. Progress over the texts is shown
    with a tqdm bar.

    Args:
        texts: Raw strings to analyse.

    Returns:
        The set of unique lower-cased noun strings found across all texts.
    """
    vocabulary: Set[str] = set()
    for raw_text in tqdm(texts):
        vocabulary.update(
            token.text.lower()
            for token in nlp(raw_text)
            if token.pos_ == 'NOUN'
        )
    return vocabulary

# Noun vocabulary of each app's issue texts (processed in this order).
(
    firefox_issue_texts_nouns,
    vlc_issue_texts_nouns,
    signal_issue_texts_nouns,
    nextcloud_issue_texts_nouns,
) = [
    text_to_nouns(issue_texts)
    for issue_texts in (
        firefox_issue_texts,
        vlc_issue_texts,
        signal_issue_texts,
        nextcloud_issue_texts,
    )
]

# Noun vocabulary of each app's reviews (same app order as above).
(
    firefox_reviews_nouns,
    vlc_reviews_nouns,
    signal_reviews_nouns,
    nextcloud_reviews_nouns,
) = [
    text_to_nouns(review_texts)
    for review_texts in (
        firefox_reviews,
        vlc_reviews,
        signal_reviews,
        nextcloud_reviews,
    )
]


100%|██████████| 29941/29941 [02:26<00:00, 204.22it/s]
100%|██████████| 553/553 [00:02<00:00, 212.41it/s]
100%|██████████| 7768/7768 [00:36<00:00, 214.48it/s]
100%|██████████| 2462/2462 [00:12<00:00, 202.51it/s]
100%|██████████| 5706/5706 [00:46<00:00, 124.01it/s]
100%|██████████| 5026/5026 [00:38<00:00, 129.97it/s]
100%|██████████| 10000/10000 [01:12<00:00, 138.18it/s]
100%|██████████| 774/774 [00:05<00:00, 150.70it/s]


In [15]:
def calculate_ratio(review_nouns: Set[str], issue_nouns: Set[str]) -> float:
    """Return the share of the combined vocabulary that both sets have in common.

    The result is ``len(intersection) / len(union)`` of the two sets, i.e. a
    value between 0.0 (no shared nouns) and 1.0 (identical sets).

    Args:
        review_nouns: Nouns extracted from the reviews.
        issue_nouns: Nouns extracted from the issue texts.

    Returns:
        The overlap ratio. When both sets are empty there is no vocabulary to
        compare, so 0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    total = len(review_nouns.union(issue_nouns))
    if total == 0:
        # Both vocabularies are empty: nothing is shared, and dividing by the
        # empty union would raise.
        return 0.0
    intersec = len(review_nouns.intersection(issue_nouns))
    return intersec / total

# (review nouns, issue nouns) per app, keyed by the label used in the report.
noun_sets_by_app = {
    'Firefox': (firefox_reviews_nouns, firefox_issue_texts_nouns),
    'VLC': (vlc_reviews_nouns, vlc_issue_texts_nouns),
    'Signal': (signal_reviews_nouns, signal_issue_texts_nouns),
    'Nextcloud': (nextcloud_reviews_nouns, nextcloud_issue_texts_nouns),
}

# Overlap ratio per app, in the same order as the mapping above.
values = {
    label: calculate_ratio(review_set, issue_set)
    for label, (review_set, issue_set) in noun_sets_by_app.items()
}

# Keep the individual per-app names available for later cells.
firefox_ratio = values['Firefox']
vlc_ratio = values['VLC']
signal_ratio = values['Signal']
nextcloud_ratio = values['Nextcloud']

print('Ratio of intersection/total vocab')
for app, ratio in values.items():
    print(f'{app:<10}: {ratio:.2f}')


Firefox   : 0.19
VLC       : 0.11
Signal    : 0.24
Nextcloud : 0.25
