In [8]:
import os
import string

import pandas as pd
import spacy
from spacy.vocab import Vocab
# Shared spaCy pipeline ('de_core_news_sm'); Dataset.__init__ and the cells below call it to tokenize text.
nlp = spacy.load('de_core_news_sm')
        
class Dataset:
    """Tokenized text document with simple vocabulary and frequency statistics.

    The document is read from ``<path>.csv`` and passed through the
    module-level ``nlp`` pipeline. Every statistic is computed over the
    ``text`` attribute of the resulting tokens.
    """

    # dataset initialization
    def __init__(self, path):
        """Load and tokenize the file ``path + '.csv'``.

        Args:
            path: file path without the ``.csv`` extension. It is also used
                as the label in log messages and to derive the names of the
                files written when ``save`` is true.
        """
        # sets the file name i.e. type:
        self.path = path

        # loads input data; the context manager closes the handle right after reading
        with open(path + '.csv') as source:
            self.data = nlp(source.read())

        # initializes the vocabulary size:
        self.vocabulary_size = 0

        print('init for ' + self.path + ' is done')

    def _output_path(self, prefix):
        """Return ``<directory of path>/<prefix><file name of path>.csv``.

        The prefix is attached to the file name, not to the whole path, so a
        path such as ``./IO_YO/young`` yields ``./IO_YO/voc_young.csv`` rather
        than a file inside a (non-existent) ``voc_.`` directory.
        """
        directory, name = os.path.split(self.path)
        return os.path.join(directory, prefix + name + '.csv')

    # returns the age group of the dataset or the file name
    def get_path(self):
        """Return the path (without extension) this dataset was created from."""
        return self.path

    # returns the raw dataset
    def get_data(self):
        """Return the tokenized document produced by ``nlp``."""
        return self.data

    # returns the vocabulary size:
    def get_vocabulary_size(self):
        """Return the number of distinct token texts found by the last
        ``create_vocabulary`` call (0 before the first call)."""
        return self.vocabulary_size

    # CALCULATE THE NR. OF CHARACTERS IN THE INPUT DOCUMENT
    def calculate_length (self):
        """Return the summed length of all token texts in the document."""
        return sum(len(token.text) for token in self.data)

    # CREATE VOCABULARY
    def create_vocabulary(self, save):
        """Collect the distinct token texts of the document.

        Args:
            save: when true, the words are also written, each followed by a
                space, to ``voc_<file name>.csv`` next to the input file.

        Returns:
            set of distinct token texts.
        """
        bag_of_words = {token.text for token in self.data}

        # assigned, not incremented: repeated calls must not inflate the size
        self.vocabulary_size = len(bag_of_words)

        if save:
            with open(self._output_path('voc_'), 'w') as voc:
                for word in bag_of_words:
                    voc.write(word + ' ')

        print('vocabulary for ' + self.path + ' is created, size: ' + str(self.vocabulary_size))
        return bag_of_words

    # CREATE DICTIONARY
    def create_dictionary(self, bag_of_words, save):
        """Count how often each vocabulary word occurs in the document.

        Args:
            bag_of_words: iterable of words to count; it must contain every
                token text of the document.
            save: when true, ``word: count`` lines are also written to
                ``dict_<file name>.csv`` next to the input file.

        Returns:
            dict mapping each word of ``bag_of_words`` to its occurrence count.

        Raises:
            KeyError: if the document contains a token text that is missing
                from ``bag_of_words``.
        """
        # create dictionary from all words
        dictionary = dict.fromkeys(bag_of_words, 0)

        # count words in bag
        for token in self.data:
            dictionary[token.text] += 1

        # save copy
        if save:
            with open(self._output_path('dict_'), 'w') as doc_out:
                for word, freq in dictionary.items():
                    doc_out.write(word + ': ' + str(freq) + '\n')

        print('dictionary for ' + self.path + ' is created')
        return dictionary

    # COUNT WORD FREQUENCY
    def calc_termFrequency(self, dictionary):
        """Convert absolute word counts into relative term frequencies.

        Args:
            dictionary: mapping of word -> occurrence count.

        Returns:
            dict mapping each word to ``count / number of tokens in the
            document``; every frequency is 0.0 when the document is empty.
        """
        count_in_data = len(self.data)

        if count_in_data == 0:
            # empty document: avoid ZeroDivisionError
            tfDict = {word: 0.0 for word in dictionary}
        else:
            tfDict = {
                word: count_in_dict / float(count_in_data)
                for word, count_in_dict in dictionary.items()
            }

        print('term frequencies for ' + self.path + ' are generated')
        return tfDict

In [3]:
# GROUP LEVEL ANALYSIS -- settings
save = True
paths = ['./IO_YO/young', './IO_YO/old']
doc_types = ['_in', '_norm']

# Load one Dataset per age group, using the second document type ('_norm').
dataset_y = Dataset(f'{paths[0]}{doc_types[1]}')
dataset_o = Dataset(f'{paths[1]}{doc_types[1]}')

# Follow-up steps, currently disabled:
#tfDict = dataset.calc_termFrequency(dict)
#dict_o = dataset_o.create_dictionary(dataset_o.create_vocabulary(save), save)
#dict_y = dataset_y.create_dictionary(dataset_y.create_vocabulary(save), save)

In [3]:
# TUNERS FOR PARTICIPANT LEVEL ANALYSIS
save = False
doc_types = ['_in', '_norm']

# Read the participant IDs; the context manager closes the file after reading.
with open('PARTICIPANT_ID.txt') as id_file:
    participant_ids = nlp(id_file.read())

# Keep only real IDs: whitespace tokens (the newlines between IDs) are dropped
# here instead of being compared by identity inside the loop, which never matched.
id_texts = [token.text for token in participant_ids if token.text.strip()]
paths = ['./IO_P/' + id_text for id_text in id_texts]

#EXECUTION
# One "<participant id>: <vocabulary size>" line per participant. The list is
# created here so the cell runs on a fresh kernel and is safe to re-run.
voc_size = []
for participant, path in zip(id_texts, paths):
    dataset = Dataset(path + doc_types[1])
    dataset.create_vocabulary(save)
    voc_size.append(participant + ': ' + str(dataset.get_vocabulary_size()) + '\n')

init for Swiss_EAR_young is done
init for Swiss_EAR_young is done


In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is never executed.
# It tokenizes each participant's '<id>.csv' with nlp and writes the token texts to
# '<id>in.csv', each followed by a space except newline tokens, which are written as-is.
# Translation of the inline note on `save`: "DO NOT CHANGE THIS, BECAUSE IT WILL WRITE 330 FILES".
# NOTE(review): `save` is assigned the string 'False' (not the boolean) and is not read in this snippet.
"""
### TUNER
save = 'False' #NE ÁLLÍTSD ÁT, MERT 330 FÁJLT FOG ÍRNI

ids= nlp(open('PARTICIPANT_ID.txt').read())
ids = [token.text for token in ids]
doc_type = ['in', 'norm']

for word in ids:
    with open(word + 'in' + '.csv', 'w') as data_creator:
        data = nlp(open(word + '.csv').read())
        data_str = [token.text for token in data]
        for element in data_str:
            if element == '\n':
                data_creator.write(element) 
            else:
                data_creator.write(element + ' ') 
    print(word + ' done')
"""