In [2]:
import nltk
from nltk.corpus.reader import CHILDESCorpusReader
from childes_un.utils.paths import DATA_DIR, ensure_dir, CHILDES_DIR
import pandas as pd
from tqdm.autonotebook import tqdm



### load all data

In [3]:
corpus_root = nltk.data.find(CHILDES_DIR.as_posix())

In [4]:
all_XML = list(CHILDES_DIR.glob('**/*.xml'))

In [7]:
transcripts = CHILDESCorpusReader(corpus_root, all_XML)
len(transcripts.fileids()), transcripts.fileids()[:3]

(41899,
 [PosixPath('/mnt/cube/tsainbur/Projects/github_repos/childes_un_project/data/raw/NLTK_Data_Dir/corpora/XLing/MDT/turkish/t9/01f_ejpd.xml'),
  PosixPath('/mnt/cube/tsainbur/Projects/github_repos/childes_un_project/data/raw/NLTK_Data_Dir/corpora/XLing/MDT/turkish/t9/17b_ejpd.xml'),
  PosixPath('/mnt/cube/tsainbur/Projects/github_repos/childes_un_project/data/raw/NLTK_Data_Dir/corpora/XLing/MDT/turkish/t9/18b_erld.xml')])

In [8]:
corpus_participants = transcripts.participants(transcripts.fileids())
len(corpus_participants), corpus_participants[:3]

(41899,
 [defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa619bf1c80>, {'SPE': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa619bf1c80>, {'id': 'SPE', 'role': 'Child', 'language': 'eng'}), 'GES': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa619bf1c80>, {'id': 'GES', 'role': 'Child', 'language': 'eng'})}), defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5e39c06a8>, {'SPE': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5e39c06a8>, {'id': 'SPE', 'role': 'Child', 'language': 'eng'}), 'GES': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5e39c06a8>, {'id': 'GES', 'role': 'Child', 'language': 'eng'})}), ...])

In [9]:
corpus_data = transcripts.corpus(transcripts.fileids())
len(corpus_data)

41899

### create transcript database
- for each transcript

In [34]:
sequence_df = pd.DataFrame(
    columns=[
        "PID",
        "lang", 
        "corpus",
        "date",
        "participant_key",
        "role",
        "age",
        "language",
        "ID",
        "POS",
        "words",
        "morphemes",
        "transcript_xml",
        "n_words",
        "n_morphemes",
        "n_POS",
    ]
)

['/mnt/cube/tsainbur/Projects/github_repos/childes_un_project/data/raw/NLTK_Data_Dir/corpora/XLing/GRERLI/cat/Teachers/ct09fnsd.xml']

In [38]:
PID, lang, corpus, date

('11312/c-00036615-1', 'cat', 'GRERLI', '1998-06-03')

In [43]:
pd.DataFrame({"test":['test']})

Unnamed: 0,test
0,test


In [196]:
tran_list = [
    [transcript.as_posix()] 
    for transcript in tqdm(all_XML)
]

HBox(children=(IntProgress(value=0, max=41899), HTML(value='')))

In [197]:
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects


@delayed 
@wrap_non_picklable_objects
def childes_get_sequence_df(transcript):
    """ get sequences of speech for each individual
    """

    
    transcripts = CHILDESCorpusReader(corpus_root, transcript)
    corpus_participants = transcripts.participants(transcript)
    # transcript info
    transcript_dict = transcripts.corpus(transcript)[0]
    PID = transcript_dict['PID']
    lang = transcript_dict['Lang']
    corpus = transcript_dict['Corpus']
    date = transcript_dict['Date']
    

    
    sequence_dfs = []
    # participant info
    for participant_id in sorted(corpus_participants[0].keys()):
        dct = corpus_participants[0][participant_id]
        pdict = corpus_participants[0][participant_id]        
        morphemes = transcripts.sents(transcript,speaker=[participant_id], stem=True)
        words = transcripts.sents(transcript,speaker=[participant_id], stem=False)
        POS = [[i[1] for i in sent] for sent in transcripts.tagged_sents(transcript,speaker=[participant_id])]
    
        
        sequence_df = pd.DataFrame({
                "PID":[PID],
                "lang":[lang], 
                "corpus":[corpus],
                "date":[date],
                "participant_key":[participant_id],
                "role":[pdict['role']],
                "age":[pdict['age']],
                "participant_language":[pdict['language']],
                "sex": [pdict['sex']],
                "ID":[participant_id],
                # word info
                "POS":[POS],
                "words":[words],
                "morphemes":[morphemes],
                "transcript_xml":transcript,
                "n_words":[len(words)],
                "n_morphemes":[len(morphemes)],
                "n_POS":[len(POS)],
            })
        #display(sequence_df)
        sequence_dfs.append(sequence_df)
    return pd.concat(sequence_dfs)

In [198]:
nex = -1
with Parallel(n_jobs=1) as parallel:
    dfs = parallel(
        childes_get_sequence_df(transcript)
        for transcript in tqdm(tran_list[:nex])
    )

HBox(children=(IntProgress(value=0, max=41898), HTML(value='')))

KeyboardInterrupt: 

In [168]:
nex = -1
with Parallel(n_jobs=2) as parallel:
    dfs = parallel(
        childes_get_sequence_df(
            [transcript.as_posix()], transcripts.participants([transcript.as_posix()])
        )
        for transcript in tqdm(all_XML[:nex])
    )

HBox(children=(IntProgress(value=0, max=41898), HTML(value='')))

KeyboardInterrupt: 

In [179]:
sequence_dfs = []
for transcript in tqdm(all_XML):
    transcript = [transcript.as_posix()]
    #print(transcripts.age(transcript))
    # transcript info
    transcript_dict = transcripts.corpus(transcript)[0]
    PID = transcript_dict['PID']
    lang = transcript_dict['Lang']
    corpus = transcript_dict['Corpus']
    date = transcript_dict['Date']
    
    # participant info
    corpus_participants = transcripts.participants(transcript)
    for participant_id in sorted(corpus_participants[0].keys()):
        dct = corpus_participants[0][participant_id]
        pdict = corpus_participants[0][participant_id]
        #print(participant_id, ": ", [(k, dct[k]) for k in sorted(dct.keys())])
        
        morphemes = transcripts.sents(transcript,speaker=[participant_id], stem=True)
        words = transcripts.sents(transcript,speaker=[participant_id], stem=False)
        POS = [i[1] for i in transcripts.tagged_sents(transcript,speaker=['MON'])]
        sequence_df = pd.DataFrame({
            "PID":[PID],
            "lang":[lang], 
            "corpus":[corpus],
            "date":[date],
            "participant_key":[participant_id],
            "role":[pdict['role']],
            "age":[pdict['age']],
            "participant_language":[pdict['language']],
            "sex": [pdict['sex']],
            "ID":[participant_id],
            # word info
            "POS":[POS],
            "words":[words],
            "morphemes":[morphemes],
            "transcript_xml":transcript,
            "n_words":[len(words)],
            "n_morphemes":[len(morphemes)],
            "n_POS":[len(POS)],
        })
    sequence_dfs.append(sequence_df)
sequence_df = pd.concat(sequence_dfs)

HBox(children=(IntProgress(value=0, max=41899), HTML(value='')))

KeyboardInterrupt: 

In [178]:
transcripts.tagged_sents(transcript,speaker=[participant_id], stem=True)

[[('un', ''), ('pa', ''), ('un', ''), ('patines', '')], [('uno', ''), ('solo', '')], ...]

In [175]:
transcripts.words(transcript,speaker=[participant_id])

['un', 'pa', 'un', 'patines', 'uno', 'solo', 'y', 'y', ...]

In [None]:
TRANSC

In [64]:
print(participant_id, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

SPE :  [('age', defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5dfec61e0>, {})), ('id', 'SPE'), ('language', 'eng'), ('role', 'Child'), ('sex', defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5dfec61e0>, {}))]


In [63]:
corpus_participants[0][participant_id]

defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
            {'id': 'SPE',
             'role': 'Child',
             'language': 'eng',
             'age': defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
                         {}),
             'sex': defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
                         {})})

In [62]:
pdict['age']

defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
            {})

In [60]:
sequence_df

Unnamed: 0,PID,lang,corpus,date,participant_key,role,age,participant_language,sex,ID,POS,words,morphemes,transcript_xml,n_words,n_morphemes,n_POS
0,11312/t-00004893-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(domates, adam, muzu, alip, tepeden, asagi, in...","(domates, adam, muzu, alip, tepeden, asagi, in...",/mnt/cube/tsainbur/Projects/github_repos/child...,10,10,0
0,11312/t-00005043-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(ondan, sonra, tepeden, ziplaya, ziplaya, iniyor)","(ondan, sonra, tepeden, ziplaya, ziplaya, iniyor)",/mnt/cube/tsainbur/Projects/github_repos/child...,6,6,0
0,11312/t-00005055-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(domates, adam, da, yuvarlana, yuvarlana, agac...","(domates, adam, da, yuvarlana, yuvarlana, agac...",/mnt/cube/tsainbur/Projects/github_repos/child...,7,7,0
0,11312/t-00005078-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(sonra, o, yuvarla, gidi, cikiyor, yukariya)","(sonra, o, yuvarla, gidi, cikiyor, yukariya)",/mnt/cube/tsainbur/Projects/github_repos/child...,6,6,0
0,11312/t-00005060-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(ilk, yesil, adam, done, done, yukari, cikiyor)","(ilk, yesil, adam, done, done, yukari, cikiyor)",/mnt/cube/tsainbur/Projects/github_repos/child...,7,7,0
0,11312/t-00004994-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(uzerinden, cikti, yesil, adam, da, yuksege, d...","(uzerinden, cikti, yesil, adam, da, yuksege, d...",/mnt/cube/tsainbur/Projects/github_repos/child...,8,8,0
0,11312/t-00005033-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(zipladi, zipladi, ziplayarak, asagi, indi)","(zipladi, zipladi, ziplayarak, asagi, indi)",/mnt/cube/tsainbur/Projects/github_repos/child...,5,5,0
0,11312/t-00005004-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(tepeden, yukariya, ziplayarak, cikiyor, kirmi...","(tepeden, yukariya, ziplayarak, cikiyor, kirmi...",/mnt/cube/tsainbur/Projects/github_repos/child...,7,7,0
0,11312/t-00005068-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(domates, sudan, yuvarlana, yuvarlana, cikiyor)","(domates, sudan, yuvarlana, yuvarlana, cikiyor)",/mnt/cube/tsainbur/Projects/github_repos/child...,5,5,0
0,11312/t-00004934-1,eng,MDT,1984-01-01,SPE,Child,{},eng,{},SPE,[],"(yesil, adam, ziplaya, ziplaya, yokustan, cikti)","(yesil, adam, ziplaya, ziplaya, yokustan, cikti)",/mnt/cube/tsainbur/Projects/github_repos/child...,6,6,0


In [65]:
sequence_df.iloc[0].transcript_xml

'/mnt/cube/tsainbur/Projects/github_repos/childes_un_project/data/raw/NLTK_Data_Dir/corpora/XLing/MDT/turkish/t9/01f_ejpd.xml'

In [24]:
transcripts.participants(transcript)[0].keys()

dict_keys(['MON', 'INV'])

In [None]:
len(childes.words(file, stem=True)

'Subject'

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',


In [33]:
len(transcripts.words(transcript,speaker=['MON'], stem=False))

213

In [13]:
corpus_participants

[defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5df38bd90>, {'VIC': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5df38bd90>, {'id': 'VIC', 'name': 'Víctor', 'role': 'Subject', 'language': 'cat', 'age': 'P41Y03M', 'sex': 'male', 'group': '2', 'education': 'L1 Catalan', 'custom-field': 'NS', 'birthday': '1957-03-10'}), 'INV': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fa5df38bd90>, {'id': 'INV', 'name': 'investigador', 'role': 'Investigator', 'language': 'cat'})})]

In [20]:
for participant_id in sorted(corpus_participants[0].keys()):
        dct = corpus_participants[0][participant_id]
        print(participant_id, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

INV :  [('id', 'INV'), ('language', 'cat'), ('name', 'investigador'), ('role', 'Investigator')]
VIC :  [('age', 'P41Y03M'), ('birthday', '1957-03-10'), ('custom-field', 'NS'), ('education', 'L1 Catalan'), ('group', '2'), ('id', 'VIC'), ('language', 'cat'), ('name', 'Víctor'), ('role', 'Subject'), ('sex', 'male')]


In [16]:
corpus_participants[0][participant_id]['role']

'Subject'

In [46]:
this_corpus_participants

defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
            {'SPE': defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
                         {'id': 'SPE', 'role': 'Child', 'language': 'eng'}),
             'GES': defaultdict(<function nltk.corpus.reader.childes.CHILDESCorpusReader._get_participants.<locals>.dictOfDicts()>,
                         {'id': 'GES', 'role': 'Child', 'language': 'eng'})})

In [39]:
transcripts.words(transcript,speaker=['GES'])

['asagi', 'tepede', 'asagi', 'ini']

In [43]:
transcripts.age(fileids=transcript, )

KeyboardInterrupt: 

In [44]:
??transcripts.age

In [37]:
corpus_participants

[defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f8378631a60>, {'SPE': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f8378631a60>, {'id': 'SPE', 'role': 'Child', 'language': 'eng'}), 'GES': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f8378631a60>, {'id': 'GES', 'role': 'Child', 'language': 'eng'})})]

In [26]:
corpus_participants

[defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f8374d02c80>, {'INF': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f8374d02c80>, {'id': 'INF', 'role': 'Informant', 'language': 'hun'}), 'INV': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f8374d02c80>, {'id': 'INV', 'role': 'Investigator', 'language': 'hun'})})]

In [17]:
transcript_dict

{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.talkbank.org/ns/talkbank https://talkbank.org/software/talkbank.xsd',
 'Media': '01f_ejpd',
 'Mediatypes': 'video',
 'PID': '11312/t-00004893-1',
 'Version': '2.10.0',
 'Lang': 'eng',
 'Corpus': 'MDT',
 'Date': '1984-01-01'}

In [19]:
PID, lang, corpus, date

('11312/t-00004893-1', 'eng', 'MDT', '1984-01-01')

GES :  [('id', 'GES'), ('language', 'eng'), ('role', 'Child')]
SPE :  [('id', 'SPE'), ('language', 'eng'), ('role', 'Child')]
GES :  [('id', 'GES'), ('language', 'eng'), ('role', 'Child')]
SPE :  [('id', 'SPE'), ('language', 'eng'), ('role', 'Child')]
