In [4]:
import scipy as sp
import numpy as np
import pandas as pd
import json
import os
import glob
from datasets import load_dataset
from tqdm import tqdm
#from english_words import english_words_set
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zipf
from scipy.optimize import curve_fit
from nltk.corpus import words
import vocab_utils as utils_
from nltk import FreqDist, word_tokenize, wordpunct_tokenize
#import wiki_dump_parser as parser

In [10]:
# Load the CHILDES word/age table and drop every row with a missing field.
vocab = pd.read_csv('CHILDES_vocab_age.csv')
vocab = vocab.dropna()

# Discretize age into integer bins 1..6, where bin k holds ages in (k-1, k].
# The earlier per-row loop appended nothing for ages outside (0, 6], which
# left `age_discrete` shorter than `vocab` and made the column assignment
# fail with a length mismatch. Out-of-range rows are now dropped explicitly
# (and reported) so the new column always lines up with the frame.
max_age_bin = 6
age_in_range = (vocab['age'] > 0) & (vocab['age'] <= max_age_bin)
n_out_of_range = int((~age_in_range).sum())
if n_out_of_range:
    print(f'dropping {n_out_of_range} rows with age outside (0, {max_age_bin}]')
vocab = vocab.loc[age_in_range].copy()
# For 0 < age <= 6, ceil(age) is exactly the upper bound of the (k-1, k] bin.
age_discrete = np.ceil(vocab['age']).astype(int).tolist()

# Attach the bins and show how many rows land in each one.
vocab['age_discrete'] = age_discrete
print(vocab.groupby('age_discrete').count())

# Lower-cased unique CHILDES words, plus the vocabulary stored in the second
# column of the Wikipedia-derived CSV.
vocab_of_interest = [i.lower() for i in vocab.word.unique().tolist()]
english_vocab = pd.read_csv('wikipedia_vocab_regex_based.csv').iloc[:, 1].values.tolist()
# Alternative source kept for reference:
#english_vocab = list(set(words.words()).union(set(vocab_of_interest)))

# Map each discrete age bin to the unique words of the rows in that bin.
age2words = {
    age: vocab.loc[vocab['age_discrete'] == age, 'word'].unique().tolist()
    for age in vocab['age_discrete'].unique().tolist()
}

# Report vocabulary sizes.
print(f'words in vocab of interest: {len(vocab_of_interest)}')
print(f'words in english vocab: {len(english_vocab)}')


# Load the Simple Wikipedia dump ('|' is the quote character in this CSV)
# and parse its ISO-style timestamps into datetimes.
df = pd.read_csv('simplewiki-20220901-pages-articles-multistream.csv', quotechar='|', index_col = False)
df['timestamp'] = pd.to_datetime(df['timestamp'],format='%Y-%m-%dT%H:%M:%SZ')

              Unnamed: 0  word   age  raw_age
age_discrete                                 
1                   2261  2261  2261     2261
2                   4348  4348  4348     4348
3                   2855  2855  2855     2855
4                   1181  1181  1181     1181
5                    396   396   396      396
6                     54    54    54       54
words in vocab of interest: 11095
words in english vocab: 2785345


In [3]:
# Load the pre-processed records into `data`.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('./ALL_FILTERED_DATA/processed_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# age-bins -> length of sentence
# length -> distribution of age
# oov words?
# age -> occurrence of oov words?


def binnify_age(list_age, bins=None):
    """Assign each age to the index of the bin it falls into.

    Parameters
    ----------
    list_age : array-like of numbers
        Ages to bin.
    bins : sequence of numbers, optional
        Monotonically increasing bin edges. Defaults to two-unit-wide bins
        from 0 to 20 followed by a final catch-all edge at 100.

    Returns
    -------
    numpy.ndarray of int
        For each age, the index ``i`` such that ``bins[i-1] <= age < bins[i]``
        (``0`` below the first edge, ``len(bins)`` at or above the last one).
    """
    if bins is None:
        bins = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 100]
    # The original body passed an undefined name (`my_list`) here, so every
    # call raised NameError; the function's own argument is what gets binned.
    bin_age = np.digitize(list_age, bins)

    return bin_age

In [20]:
# Accumulators keyed by (clamped) sentence length or by data source.
# age2len, age2oov and source2len are created here but not filled by this cell.
age2len, len2age = {}, {}
age2oov, len2oov = {}, {}
source2len, source2oov, source2age = {}, {}, {}

for _key, record in tqdm(data.items()):
    # Clamp the sentence length into the range [4, 20].
    len_ = min(max(4, int(record['SENTENCE_LENGTH'])), 20)
    age_ = record['AVG_AGE']
    oov_ = record['OOV_WORDS']
    source_ = record['DATA_SOURCE']
    n_oov = len(oov_)

    # Group average age and OOV count by sentence length.
    len2age.setdefault(len_, []).append(age_)
    len2oov.setdefault(len_, []).append(n_oov)

    # Group OOV count and average age by data source.
    source2oov.setdefault(source_, []).append(n_oov)
    source2age.setdefault(source_, []).append(age_)

100%|███████████████████████████████████████████████████████████████████████████████████████| 7006061/7006061 [00:16<00:00, 428408.59it/s]


In [26]:
# Summarize the age values collected per data source, ignoring NaNs.
# NOTE(review): the 12th percentile sits where a quartile (25) would be expected — confirm it is intended.
age_percentiles = [0, 12, 50, 75, 100]
for source, ages in source2age.items():
    dist = np.nanpercentile(ages, age_percentiles)
    print(f'{source} distribution of age: {dist}')

cbt distribution of age: [0.71428571 1.         1.125      1.22727273 2.5       ]
simplified distribution of age: [0.         0.83333333 1.22222222 1.4        3.        ]
bookcorpus distribution of age: [0.         1.         1.11764706 1.23076923 3.5       ]
wikipedia distribution of age: [0.         1.         1.28571429 1.5        3.66666667]


In [27]:
# Summarize the per-record OOV counts collected per data source, ignoring NaNs.
# NOTE(review): the 12th percentile sits where a quartile (25) would be expected — confirm it is intended.
oov_percentiles = [0, 12, 50, 75, 100]
for source, oov_counts in source2oov.items():
    dist = np.nanpercentile(oov_counts, oov_percentiles)
    print(f'{source} distribution of oov: {dist}')

cbt distribution of oov: [0. 0. 0. 0. 2.]
simplified distribution of oov: [  0.   0.   0.   1. 101.]
bookcorpus distribution of oov: [ 0.  0.  0.  0. 42.]
wikipedia distribution of oov: [  0.   0.   0.   0. 218.]


In [28]:
# Summarize the age values collected per clamped sentence length, ignoring NaNs.
# NOTE(review): the 12th percentile sits where a quartile (25) would be expected — confirm it is intended.
len_age_percentiles = [0, 12, 50, 75, 100]
for len_, ages in len2age.items():
    dist = np.nanpercentile(ages, len_age_percentiles)
    print(f'{len_} distribution of age: {dist}')

17 distribution of age: [0.         1.05882353 1.17647059 1.29411765 2.29411765]
4 distribution of age: [0.         0.5        1.         1.25       3.66666667]
8 distribution of age: [0.    1.    1.125 1.25  3.   ]
10 distribution of age: [0.  1.  1.1 1.3 3. ]
6 distribution of age: [0.         1.         1.16666667 1.33333333 3.5       ]
15 distribution of age: [0.         1.         1.13333333 1.26666667 2.53333333]
13 distribution of age: [0.         1.         1.15384615 1.30769231 2.53846154]
7 distribution of age: [0.         1.         1.14285714 1.28571429 3.42857143]
16 distribution of age: [0.     1.     1.1875 1.25   2.5   ]
14 distribution of age: [0.         1.         1.14285714 1.28571429 2.5       ]
9 distribution of age: [0.         1.         1.11111111 1.22222222 3.        ]
12 distribution of age: [0.         1.         1.16666667 1.25       2.58333333]
18 distribution of age: [0.         1.05555556 1.16666667 1.27777778 2.27777778]
20 distribution of age: [0.     

In [29]:
# Print the kernel's current working directory via the shell.
!pwd

/home/vdeshpande/vocabulary_analysis/data_filtering/based_on_aochild
