In [1]:
%env NLTK_DATA= /mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/raw/NLTK_Data_Dir/corpora

env: NLTK_DATA=/mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/raw/NLTK_Data_Dir/corpora


In [2]:
import nltk
from nltk.corpus.reader import CHILDESCorpusReader
from childes_mi.utils.paths import DATA_DIR, ensure_dir, CHILDES_DIR, CHILDES_DFS
import pandas as pd
from tqdm.autonotebook import tqdm
import numpy as np

  from tqdm.autonotebook import tqdm


### load all data

In [3]:
CHILDES_DIR.as_posix()

'/mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/raw/NLTK_Data_Dir/corpora'

In [4]:
corpus_root = nltk.data.find(CHILDES_DIR.as_posix())

In [5]:
all_XML = list(CHILDES_DIR.glob('**/*.xml'))

In [6]:
transcripts = CHILDESCorpusReader(corpus_root, all_XML)
len(transcripts.fileids()), transcripts.fileids()[:3]

(41844,
 [PosixPath('/mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/raw/NLTK_Data_Dir/corpora/Slavic/Croatian/Kovacevic/marjon/010503.xml'),
  PosixPath('/mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/raw/NLTK_Data_Dir/corpora/Slavic/Croatian/Kovacevic/marjon/010614.xml'),
  PosixPath('/mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/raw/NLTK_Data_Dir/corpora/Slavic/Croatian/Kovacevic/marjon/010628.xml')])

In [7]:
corpus_participants = transcripts.participants(transcripts.fileids())
len(corpus_participants), corpus_participants[:3]

(41844,
 [defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f7205736a60>, {'CHI': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f7205736a60>, {'id': 'CHI', 'name': 'Marina', 'role': 'Target_Child', 'language': 'hrv', 'age': 'P1Y05M03D', 'sex': 'male'}), 'SAN': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f7205736a60>, {'id': 'SAN', 'name': 'Sandra', 'role': 'Mother', 'language': 'hrv'}), 'TAT': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f7205736a60>, {'id': 'TAT', 'name': 'Tata', 'role': 'Father', 'language': 'hrv'})}), defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f7205736c10>, {'CHI': defaultdict(<function CHILDESCorpusReader._get_participants.<locals>.dictOfDicts at 0x7f7205736c10>, {'id': 'CHI', 'name': 'Marina', 'role': 'Target_Child', 'language': 'hrv', 'age': 'P1Y06M14D', 'sex'

In [8]:
corpus_data = transcripts.corpus(transcripts.fileids())
len(corpus_data)

41844

### create transcript database
- for each transcript

In [9]:
tran_list = [
    [transcript.as_posix()] 
    for transcript in tqdm(all_XML)
]

  0%|          | 0/41844 [00:00<?, ?it/s]

In [10]:
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects
import collections


@delayed
@wrap_non_picklable_objects
def childes_get_sequence_df(transcript):
    """ get sequences of speech for each individual
    """

    transcripts = CHILDESCorpusReader(corpus_root, transcript)
    corpus_participants = transcripts.participants(transcript)[0]
    # transcript info
    transcript_dict = transcripts.corpus(transcript)[0]
    if "PID" not in transcript_dict.keys():
        PID = "None"
    else:
        PID = transcript_dict["PID"]
    lang = transcript_dict["Lang"]
    corpus = transcript_dict["Corpus"]
    date = transcript_dict["Date"]

    sequence_dfs = []
    # participant info
    for participant_id in sorted(corpus_participants.keys()):
        dct = corpus_participants[participant_id]
        pdict = corpus_participants[participant_id]
        morphemes = transcripts.sents(transcript, speaker=[participant_id], stem=True)
        words = transcripts.sents(transcript, speaker=[participant_id], stem=False)
        POS = [
            [i[1] for i in sent]
            for sent in transcripts.tagged_sents(transcript, speaker=[participant_id])
        ]

        # ensure that age and sex have values
        if type(pdict["age"]) == collections.defaultdict:
            age = np.nan
        else:
            age = pdict["age"]
        if type(pdict["sex"]) == collections.defaultdict:
            sex = np.nan
        else:
            sex = pdict["sex"]

        sequence_dfs.append(
            [
                PID,
                lang,
                corpus,
                date,
                participant_id,
                pdict["role"],
                age,
                pdict["language"],
                sex,
                list(POS),
                list(words),
                list(morphemes),
                transcript,
                len(words),
                len(morphemes),
                len(POS),
            ]
        )
    return sequence_dfs

In [None]:
nex = -1
with Parallel(n_jobs=-1) as parallel:
    seq_dfs =parallel(
                childes_get_sequence_df(transcript)
                for transcript in tqdm(tran_list[:nex])
            )
    seq_dfs = [item for sublist in seq_dfs for item in sublist]
seq_df = pd.DataFrame(seq_dfs,
        columns=[
            "PID",
            "lang",
            "corpus",
            "date",
            "participant_key",
            "role",
            "age",
            "language",
            "sex",
            "POS",
            "words",
            "morphemes",
            "transcript_xml",
            "n_words",
            "n_morphemes",
            "n_POS",
        ],
    )

  0%|          | 0/41843 [00:00<?, ?it/s]

In [15]:
seq_df[:3]

Unnamed: 0,PID,lang,corpus,date,participant_key,role,age,language,sex,POS,words,morphemes,transcript_xml,n_words,n_morphemes,n_POS
0,11312/c-00032871-1,hrv,Kovacevic,1994-03-27,CHI,Target_Child,P1Y05M03D,hrv,male,"[[], [], [], [], [], [], [], [], [, ], [, , ],...","[[dadaj], [ahadada], [dadad], [mama], [deda], ...","[[dadaj], [ahadada], [dadad], [mama], [deda], ...",[/mnt/cube/tsainbur/Projects/github_repos/LRSO...,166,166,166
1,11312/c-00032871-1,hrv,Kovacevic,1994-03-27,SAN,Mother,,hrv,,"[[, ], [, , , , , , ], [, , , , , ], [, , , ],...","[[evo, ga], [kud, ideš, ti, sad, kud, idemo, m...","[[evo, ga], [kud, ideš, ti, sad, kud, idemo, m...",[/mnt/cube/tsainbur/Projects/github_repos/LRSO...,226,226,226
2,11312/c-00032871-1,hrv,Kovacevic,1994-03-27,TAT,Father,,hrv,,"[[, , , ], [], [, , ], [, , , , , , , ], [, ],...","[[ma, što, pričaš, ti], [koga], [jesi, dedu, z...","[[ma, što, pričaš, ti], [koga], [jesi, dedu, z...",[/mnt/cube/tsainbur/Projects/github_repos/LRSO...,37,37,37


In [16]:
seq_df.to_pickle(CHILDES_DFS/'childes_df_full.pickle')

In [21]:
!ls -lht /mnt/cube/tsainbur/Projects/github_repos/LRSO_paper_rev/LongRangeSequentialOrgPaper/data/processed/childes/

total 1.2G
-rw-r--r-- 1 tsainbur psych-gentnerlab-users 1.4G Feb 17 21:16 childes_df_full.pickle
-rw-r----- 1 tsainbur psych-gentnerlab-users  30M Apr  7  2020 childes_df_subset.pickle
-rw-r----- 1 tsainbur psych-gentnerlab-users 1.4G Apr  7  2020 childes_df.pickle


In [17]:
for col in seq_df.columns:
    print(
        col,
        np.unique([str(type(i)) for i in seq_df[col].values])
    )

PID ["<class 'str'>"]
lang ["<class 'str'>"]
corpus ["<class 'str'>"]
date ["<class 'str'>"]
participant_key ["<class 'str'>"]
role ["<class 'str'>"]
age ["<class 'float'>" "<class 'str'>"]
language ["<class 'str'>"]
sex ["<class 'float'>" "<class 'str'>"]
POS ["<class 'list'>"]
words ["<class 'list'>"]
morphemes ["<class 'list'>"]
transcript_xml ["<class 'list'>"]
n_words ["<class 'numpy.int64'>"]
n_morphemes ["<class 'numpy.int64'>"]
n_POS ["<class 'numpy.int64'>"]


In [35]:
from childes_mi.information_theory.emi._nij_op_cython import nij_op_cython

ModuleNotFoundError: No module named 'childes_mi.information_theory.emi._nij_op_cython'

In [37]:
from childes_mi.information_theory.emi import _nij_op_cython

ImportError: cannot import name '_nij_op_cython' from 'childes_mi.information_theory.emi' (unknown location)

In [38]:
from childes_mi.information_theory import mutual_information as mi

ModuleNotFoundError: No module named 'childes_mi.information_theory.emi._nij_op_cython'

In [27]:
emi._nij_op_cython

AttributeError: module 'childes_mi.information_theory.emi' has no attribute '_nij_op_cython'