In [1]:
import nltk
from nltk.corpus.reader import CHILDESCorpusReader
from childes_mi.utils.paths import DATA_DIR, ensure_dir, CHILDES_DIR, CHILDES_DFS
import pandas as pd
from tqdm.autonotebook import tqdm
import numpy as np



### load all data

In [2]:
corpus_root = nltk.data.find(CHILDES_DIR.as_posix())

In [3]:
all_XML = list(CHILDES_DIR.glob('**/*.xml'))

In [4]:
transcripts = CHILDESCorpusReader(corpus_root, all_XML)
len(transcripts.fileids()), transcripts.fileids()[:3]

(41844,
 [PosixPath('/mnt/cube/tsainbur/Projects/github_repos/childes_mi_project/data/raw/NLTK_Data_Dir/corpora/Scandinavian/Danish/Plunkett/Anne/021111.xml'),
  PosixPath('/mnt/cube/tsainbur/Projects/github_repos/childes_mi_project/data/raw/NLTK_Data_Dir/corpora/Scandinavian/Danish/Plunkett/Anne/010019.xml'),
  PosixPath('/mnt/cube/tsainbur/Projects/github_repos/childes_mi_project/data/raw/NLTK_Data_Dir/corpora/Scandinavian/Danish/Plunkett/Anne/010307.xml')])

In [5]:
corpus_participants = transcripts.participants(transcripts.fileids())
len(corpus_participants), corpus_participants[:3]

(41844,
 [defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fe69c13eea0>, {'CHI': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fe69c13eea0>, {'id': 'CHI', 'name': 'Anne', 'role': 'Target_Child', 'language': 'dan', 'age': 'P2Y11M11D'}), 'MOT': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fe69c13eea0>, {'id': 'MOT', 'name': 'Annette', 'role': 'Mother', 'language': 'dan', 'sex': 'female', 'SES': 'UC'}), 'INV': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fe69c13eea0>, {'id': 'INV', 'name': 'Berit', 'role': 'Investigator', 'language': 'dan'}), 'CAM': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7fe69c13eea0>, {'id': 'CAM', 'name': 'Sebastian_Camera_Operator', 'role': 'Investigator', 'language': 'dan'})}), defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0

In [6]:
corpus_data = transcripts.corpus(transcripts.fileids())
len(corpus_data)

41844

### create transcript database
- for each transcript

In [7]:
tran_list = [
    [transcript.as_posix()] 
    for transcript in tqdm(all_XML)
]

HBox(children=(IntProgress(value=0, max=41844), HTML(value='')))




In [8]:
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects
import collections


@delayed
@wrap_non_picklable_objects
def childes_get_sequence_df(transcript):
    """ get sequences of speech for each individual
    """

    transcripts = CHILDESCorpusReader(corpus_root, transcript)
    corpus_participants = transcripts.participants(transcript)[0]
    # transcript info
    transcript_dict = transcripts.corpus(transcript)[0]
    if "PID" not in transcript_dict.keys():
        PID = "None"
    else:
        PID = transcript_dict["PID"]
    lang = transcript_dict["Lang"]
    corpus = transcript_dict["Corpus"]
    date = transcript_dict["Date"]

    sequence_dfs = []
    # participant info
    for participant_id in sorted(corpus_participants.keys()):
        dct = corpus_participants[participant_id]
        pdict = corpus_participants[participant_id]
        morphemes = transcripts.sents(transcript, speaker=[participant_id], stem=True)
        words = transcripts.sents(transcript, speaker=[participant_id], stem=False)
        POS = [
            [i[1] for i in sent]
            for sent in transcripts.tagged_sents(transcript, speaker=[participant_id])
        ]

        # ensure that age and sex have values
        if type(pdict["age"]) == collections.defaultdict:
            age = np.nan
        else:
            age = pdict["age"]
        if type(pdict["sex"]) == collections.defaultdict:
            sex = np.nan
        else:
            sex = pdict["sex"]

        sequence_dfs.append(
            [
                PID,
                lang,
                corpus,
                date,
                participant_id,
                pdict["role"],
                age,
                pdict["language"],
                sex,
                list(POS),
                list(words),
                list(morphemes),
                transcript,
                len(words),
                len(morphemes),
                len(POS),
            ]
        )
    return sequence_dfs

In [9]:
nex = -1
with Parallel(n_jobs=-1) as parallel:
    seq_dfs =parallel(
                childes_get_sequence_df(transcript)
                for transcript in tqdm(tran_list[:nex])
            )
    seq_dfs = [item for sublist in seq_dfs for item in sublist]
seq_df = pd.DataFrame(seq_dfs,
        columns=[
            "PID",
            "lang",
            "corpus",
            "date",
            "participant_key",
            "role",
            "age",
            "language",
            "sex",
            "POS",
            "words",
            "morphemes",
            "transcript_xml",
            "n_words",
            "n_morphemes",
            "n_POS",
        ],
    )

HBox(children=(IntProgress(value=0, max=41843), HTML(value='')))




In [10]:
seq_df[:3]

Unnamed: 0,PID,lang,corpus,date,participant_key,role,age,language,sex,POS,words,morphemes,transcript_xml,n_words,n_morphemes,n_POS
0,11312/c-00046421-1,dan,Plunkett,1985-01-31,CAM,Investigator,,dan,,"[[, ], [, ], [, , , , , , , ], [, , , , ]]","[[gør, det], [det, er], [ellers, skal, vi, til...","[[gør, det], [det, er], [ellers, skal, vi, til...",[/mnt/cube/tsainbur/Projects/github_repos/chil...,4,4,4
1,11312/c-00046421-1,dan,Plunkett,1985-01-31,CHI,Target_Child,P2Y11M11D,dan,,"[[, , , , , , ], [, , , , ], [], [], [], [], [...","[[jeg, vil, gerne, have, noget, at, drikke], [...","[[jeg, vil, gerne, have, noget, at, drikke], [...",[/mnt/cube/tsainbur/Projects/github_repos/chil...,566,566,566
2,11312/c-00046421-1,dan,Plunkett,1985-01-31,INV,Investigator,,dan,,"[[, , , , , ], [], [], [, ], [, , , , , , , , ...","[[det, må, du, også, gerne, få], [mm], [kan], ...","[[det, må, du, også, gerne, få], [mm], [kan], ...",[/mnt/cube/tsainbur/Projects/github_repos/chil...,383,383,383


In [11]:
seq_df.to_pickle(CHILDES_DFS/'childes_df.pickle')

In [12]:
for col in seq_df.columns:
    print(
        col,
        np.unique([str(type(i)) for i in seq_df[col].values])
    )

PID ["<class 'str'>"]
lang ["<class 'str'>"]
corpus ["<class 'str'>"]
date ["<class 'str'>"]
participant_key ["<class 'str'>"]
role ["<class 'str'>"]
age ["<class 'float'>" "<class 'str'>"]
language ["<class 'str'>"]
sex ["<class 'float'>" "<class 'str'>"]
POS ["<class 'list'>"]
words ["<class 'list'>"]
morphemes ["<class 'list'>"]
transcript_xml ["<class 'list'>"]
n_words ["<class 'numpy.int64'>"]
n_morphemes ["<class 'numpy.int64'>"]
n_POS ["<class 'numpy.int64'>"]
