In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Language codes to analyse; `analyse` reads the file `<code>.train` for each one.
languages = [
    'afb', 'ail', 'ame', 'amh', 'ara', 'arz', 'aym',
    'bra', 'bul', 'ces', 'ckb', 'ckt', 'cni', 'deu',
    'evn', 'gup', 'heb', 'ind', 'itl', 'kmr', 'kod',
    'krl', 'lud', 'mag', 'nld', 'olo', 'pol', 'por',
    'rus', 'sah', 'see', 'spa', 'syc', 'tyv', 'vep',
]

In [141]:
def target_alphabet(df):
    """Return the set of distinct characters used in the ``infl`` column.

    Args:
        df: DataFrame with an ``infl`` column whose values are iterable
            (strings in this notebook).

    Returns:
        A ``set`` with every element yielded by iterating each ``infl`` value;
        empty when the column has no rows.
    """
    return {char for form in df.infl for char in form}

def source_alphabet(df):
    """Return the set of distinct characters used in the ``lemma`` column.

    Args:
        df: DataFrame with a ``lemma`` column whose values are iterable
            (strings in this notebook).

    Returns:
        A ``set`` with every element yielded by iterating each ``lemma`` value;
        empty when the column has no rows.
    """
    return {char for word in df.lemma for char in word}

def merged_alphabet(df):
    """Return the set of distinct characters used in ``infl`` or ``lemma``.

    Args:
        df: DataFrame with ``infl`` and ``lemma`` columns whose values are
            iterable (strings in this notebook).

    Returns:
        The union of the characters seen in both columns as a ``set``.
    """
    return {
        char
        for column in (df.infl, df.lemma)
        for word in column
        for char in word
    }

def tags_combination(df):
    """Return the set of distinct values of the ``tags`` column.

    Each value is kept whole (not split on ``;``), so the result holds the
    distinct tag combinations present in ``df``.
    """
    return set(df.tags)

def tags_of_df(df):
    """Return the set of individual tags found in the ``tags`` column.

    Every value of ``df.tags`` is split on ``';'`` and the resulting pieces
    are collected into one ``set``.
    """
    return {tag for bundle in df.tags for tag in bundle.split(';')}

def average_diff_between_lemma_and_infl(df):
    """Return the mean absolute difference in length between lemma and infl.

    The caller's DataFrame is left untouched: the per-row differences are
    computed in a temporary Series instead of being written to a ``diff``
    column.

    Args:
        df: DataFrame with ``lemma`` and ``infl`` columns holding sized values
            (strings in this notebook).

    Returns:
        ``sum(|len(lemma) - len(infl)|) / len(df)``, or ``nan`` when ``df``
        has no rows.

    Raises:
        TypeError: if a value has no length (e.g. a float NaN).
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to average; avoid a 0/0 division.
        return float('nan')
    # `map(len)` keeps the original fail-loud behaviour on non-sized values.
    length_gap = (df.lemma.map(len) - df.infl.map(len)).abs()
    return length_gap.sum() / row_count

def average_lemma_len(df):
    """Return the mean length of the values in the ``lemma`` column.

    The caller's DataFrame is left untouched: no ``lemma_len`` column is
    added to it.

    Args:
        df: DataFrame with a ``lemma`` column holding sized values
            (strings in this notebook).

    Returns:
        ``sum(len(lemma)) / len(df)``, or ``nan`` when ``df`` has no rows.

    Raises:
        TypeError: if a value has no length (e.g. a float NaN).
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to average; avoid a 0/0 division.
        return float('nan')
    return df.lemma.map(len).sum() / row_count

def average_infl_len(df):
    """Return the mean length of the values in the ``infl`` column.

    The caller's DataFrame is left untouched: no ``infl_len`` column is
    added to it.

    Args:
        df: DataFrame with an ``infl`` column holding sized values
            (strings in this notebook).

    Returns:
        ``sum(len(infl)) / len(df)``, or ``nan`` when ``df`` has no rows.

    Raises:
        TypeError: if a value has no length (e.g. a float NaN).
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to average; avoid a 0/0 division.
        return float('nan')
    return df.infl.map(len).sum() / row_count


def analyse(datas):
    """Print summary statistics for each language's training file.

    For every ``lang`` in ``datas`` the tab-separated, header-less file
    ``lang + '.train'`` is read with the columns ``lemma``, ``infl`` and
    ``tags``, and the following figures are printed: number of lines, number
    of distinct tags, number of distinct tag combinations, source / target /
    merged alphabet sizes, number of unique lemmas, and the average length
    statistics.

    Args:
        datas: iterable of file-name stems (language codes).

    Returns:
        None. All results are written to stdout.
    """
    for lang in datas:
        df = pd.read_csv(
            lang + '.train',
            sep='\t',
            header=None,
            names=['lemma', 'infl', 'tags'],
            # Every field is text. Without these two options pandas turns word
            # forms such as "nan", "null", "NA" or "None" into missing values,
            # which would distort the alphabet and length statistics.
            dtype=str,
            keep_default_na=False,
        )
        # Defensive: should a missing value still appear (e.g. a row with
        # fewer than three fields), treat it as an empty string.
        df = df.fillna('')

        print(lang+':')
        print('\tLines:',len(df))

        print('\tTags len:', len(tags_of_df(df)))

        print('\tTags combination len:', len(tags_combination(df)))

        print('\tSource vocab len:', len(source_alphabet(df)))
        print('\tTarget vocab len:', len(target_alphabet(df)))
        print('\tMerged vocab len:', len(merged_alphabet(df)))

        # `describe().unique()[1]` returned the most frequent lemma instead of
        # the count whenever every lemma was distinct; ask for the count directly.
        print('\tUnique lemmas:', df.lemma.nunique())

        print('\tAverage difference between lemma and infl:', average_diff_between_lemma_and_infl(df))

        print('\tAverage lemma len:', average_lemma_len(df))

        print('\tAverage infl len:', average_infl_len(df))

In [142]:
# Print the statistics of every language code listed in `languages`.
analyse(languages)

afb:
	Lines: 22165
	Tags len: 18
	Tags combination len: 68
	Source vocab len: 46
	Target vocab len: 39
	Merged vocab len: 46
	Unique lemmas: 5697
	Average difference between lemma and infl: 1.7637265959846604
	Average lemma len: 5.813850665463569
	Average infl len: 6.257207308820212
ail:
	Lines: 918
	Tags len: 60
	Tags combination len: 240
	Source vocab len: 36
	Target vocab len: 42
	Merged vocab len: 44
	Unique lemmas: 470
	Average difference between lemma and infl: 4.147058823529412
	Average lemma len: 4.533769063180828
	Average infl len: 8.639433551198257
ame:
	Lines: 2524
	Tags len: 21
	Tags combination len: 24
	Source vocab len: 26
	Target vocab len: 27
	Merged vocab len: 27
	Unique lemmas: 323
	Average difference between lemma and infl: 1.972662440570523
	Average lemma len: 8.375594294770206
	Average infl len: 9.025752773375594
amh:
	Lines: 32254
	Tags len: 32
	Tags combination len: 123
	Source vocab len: 198
	Target vocab len: 229
	Merged vocab len: 229
	Unique lemmas: 2428
	Ave

	Merged vocab len: 38
	Unique lemmas: 1410
	Average difference between lemma and infl: 6.4643663914599285
	Average lemma len: 6.671880934770006
	Average infl len: 13.136207344621473
see:
	Lines: 3801
	Tags len: 13
	Tags combination len: 39
	Source vocab len: 25
	Target vocab len: 30
	Merged vocab len: 33
	Unique lemmas: 140
	Average difference between lemma and infl: 4.574059458037358
	Average lemma len: 6.280452512496711
	Average infl len: 10.82609839515917
spa:
	Lines: 100001
	Tags len: 23
	Tags combination len: 70
	Source vocab len: 29
	Target vocab len: 34
	Merged vocab len: 34
	Unique lemmas: 2046
	Average difference between lemma and infl: 2.005949940500595
	Average lemma len: 8.44459555404446
	Average infl len: 10.24828751712483
syc:
	Lines: 1217
	Tags len: 31
	Tags combination len: 276
	Source vocab len: 23
	Target vocab len: 31
	Merged vocab len: 32
	Unique lemmas: 534
	Average difference between lemma and infl: 6.154478225143796
	Average lemma len: 4.0262941659819225
	Average