In [1]:
%matplotlib widget
import os
import sys
from re import sub
import numpy as np
from collections import defaultdict, Counter
from random import sample
from itertools import chain
import pandas as pd
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 6]
from pandas import DataFrame

from sklearn import metrics

In [2]:
# All inputs and outputs of this run live under one results directory.
results_root = "../outputs/BPEresults_productivity_corpusPBCtok_0_200_1/"

# Directory holding the TSV input files.
directory = results_root + "tsv/"
# CSV that receives the resulting centered and scaled datapoints.
output = results_root + "corpusPBCtok200_vectors.csv"

# BPE SPACE

In [3]:
# Build one 3-d "BPE space" point per input file: average the three measures of each
# TSV file, standardise the averages across files, save them to `output` and plot them.
measure_columns = ['prod', 'cum_freq', 'idiosincracy_index']

# os.listdir returns entries in arbitrary order; sorting makes the row order of the
# saved CSV reproducible. Sub-directories (e.g. .ipynb_checkpoints) are skipped because
# read_csv cannot open them.
all_files = sorted(
    entry for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry))
)

# Collect one record per file and build the frame once: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic anyway.
records = []
for n in all_files:
    corpus_table = pd.read_csv(os.path.join(directory, n), sep='\t')

    # The identifier is the file name up to the first "." and then up to the first "-".
    isocode = n.split(".")[0].split("-")[0]

    record = {'file': isocode}
    for column in measure_columns:
        record[column] = corpus_table[column].mean()
    records.append(record)

if not records:
    # StandardScaler would otherwise fail with a less helpful message on an empty frame.
    raise FileNotFoundError("No input files found in " + directory)

df_avg = pd.DataFrame(records, columns=['file'] + measure_columns)
df_isocodes = df_avg[['file']]

# Standardise each measure (zero mean, unit variance) across all files.
trans = StandardScaler()
data = trans.fit_transform(df_avg[measure_columns])

# Put the scaled values back next to the identifiers; both frames share a 0..n-1 index.
df_avg_transformed = df_isocodes.join(pd.DataFrame(data, columns=measure_columns))
df_avg_transformed.to_csv(output, index=False)

# ---- 3-d scatter plot, points and labels coloured by the scaled 'prod' value ----
fig = plt.figure(figsize=(13.6, 13.6))
ax = fig.add_subplot(111, projection='3d')

xs = df_avg_transformed['prod']
ys = df_avg_transformed['cum_freq']
zs = df_avg_transformed['idiosincracy_index']
labels = df_avg_transformed['file']

ax.set_xlabel('|W| Productivity', fontdict=dict(weight='bold'))
ax.set_ylabel('C. freq', fontdict=dict(weight='bold'))
ax.set_zlabel('Idiosyncrasy', fontdict=dict(weight='bold'))

img = ax.scatter(xs, ys, zs, c=xs, s=50, alpha=1, edgecolors='w', cmap="jet")

# Hand-picked text colours approximating the "jet" colormap, one per band of the scaled
# 'prod' value. Each entry is (exclusive upper bound, colour) for values above -1.
_LOWEST_BAND_COLOUR = "#00009b"    # value <= -1
_HIGHEST_BAND_COLOUR = "#800000"   # value >= 3
_BAND_COLOURS = [
    (-0.4, "#0000cd"),
    (0, "#007dff"),
    (0.2, "#00b9ff"),
    (0.4, "#02e8f4"),
    (0.6, "#23ffd4"),
    (1.1, "#5fffaa"),
    (2, "#baff3c"),
    (3, "#ff4300"),
]


def _bpe_label_colour(value):
    """Return the label colour for a scaled 'prod' value, or None for NaN.

    The top band includes exactly 3.0; the previous `<3` / `>3` pair left that value
    without any label.
    """
    if value <= -1:
        return _LOWEST_BAND_COLOUR
    for upper_bound, colour in _BAND_COLOURS:
        if value < upper_bound:
            return colour
    if value >= 3:
        return _HIGHEST_BAND_COLOUR
    return None  # NaN compares False everywhere: draw no label, as before


for x, y, z, label in zip(xs, ys, zs, labels):
    text_colour = _bpe_label_colour(x)
    if text_colour is None:
        continue
    # `weight` and `fontweight` are aliases. When both were given, the later "medium"
    # took effect, and recent Matplotlib releases raise TypeError on the pair, so only
    # one is passed.
    ax.text(x, y, z, label, size="large", color=text_colour, fontweight="medium")

cbar = fig.colorbar(img, ax=ax, fraction=0.026, pad=0.04)
fig.savefig("../outputs/BPEresults_productivity_corpusPBCtok_0_200_1/BPE_space.png")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# WALS

In [4]:
# WALS language table, indexed by its second column.
wals_lang = pd.read_csv('WALS/languages.csv', index_col=1)
# WALS value table, indexed by its first column.
wals_values = pd.read_csv('WALS/values.csv', index_col=0)

# Scaled BPE vectors read back from `output`: first column as index, rows ordered by it.
bpe_vectors = pd.read_csv(output, index_col=0)
bpe_vectors = bpe_vectors.sort_index()

In [5]:
# Attach the selected WALS feature values to the BPE vectors, one column per feature.

# We obtain the isocodes and extract from WALS the info only for those languages
# (restricted to the ones that are part of the Samples_100 set).
# Kew and Jac missing, original csv had to be modified.
isocodes = bpe_vectors.index.tolist()
wals_lang_filtered = wals_lang.query('ISO_codes in @isocodes')
wals_lang_filtered = wals_lang_filtered.loc[wals_lang_filtered['Samples_100'] == True]
# `columns=` instead of a positional axis: pandas 2.0 no longer accepts drop(labels, 1).
wals_lang_filtered = wals_lang_filtered.drop(columns=['Samples_100', 'Samples_200'])

featuresofinterest = ["20A", "22A", "26A", "27A", "28A", "29A", "30A", "33A", "34A", "37A", "38A", "49A", "51A", "57A", "59A", "65A", "66A", "67A", "69A", "70A", "73A", "74A", "75A", "78A", "94A", "101A", "102A", "111A", "112A"]
walscodes = wals_lang_filtered["ID"].tolist()

# Loop-invariant work, done once: rename the key so it matches the value table, and keep
# only the value rows that belong to the selected languages.
wals_lang_filtered = wals_lang_filtered.rename(columns={"ID": "Language_ID"})
wals_values_selected = wals_values.query('Language_ID in @walscodes')

all_features = bpe_vectors
for feature in featuresofinterest:
    # Rows of the value table for this feature only.
    wals_values_filtered = wals_values_selected.loc[wals_values_selected['Parameter_ID'] == feature]

    # Merge with the language info and re-index by the identifier used in bpe_vectors.
    aux1 = (
        wals_values_filtered
        .merge(wals_lang_filtered, on="Language_ID")
        .drop(columns=['Comment', 'Source_x', 'Example_ID', 'Source_y'])
        .rename(columns={"ISO_codes": "file"})
        .set_index('file')
    )

    # BPE vectors plus everything known about this feature. The value left over from the
    # last iteration is read by the clustering cell further down.
    all_values = bpe_vectors.join(aux1)

    # All features together: one column per feature, named after its WALS id.
    all_features = all_features.join(aux1['Value'].rename(feature))


In [6]:
# WALS features fed to the clustering step (20A was added to the original selection).
selected_wals_features = [
    "20A", "49A", "69A", "70A", "112A",
    "22A", "26A", "28A", "29A", "59A",
    "65A", "66A", "67A", "78A", "102A",
]

# Keep only those columns and substitute NaN by zero.
some_features = all_features[selected_wals_features].fillna(0)

# CLUSTERING

In [7]:
# Cluster the languages on their scaled WALS features with k-means, draw the clusters in
# the 3-d BPE space and report the silhouette score of that labelling in BPE space.
import matplotlib.colors

caption = "normal"
dataset = some_features
clusters = 4
bpe_columns = ['prod', 'cum_freq', 'idiosincracy_index']

# Center and scale the feature matrix.
trans = StandardScaler()
data = trans.fit_transform(dataset)

# ---- K-means ----
# n_init is pinned to 10 because the library default changed in scikit-learn 1.4, and
# relying on it would change the clustering silently.
# NOTE: algorithm="full" was renamed "lloyd" in scikit-learn 1.1 and the old spelling
# was later removed; switch the string when upgrading.
km = sklearn.cluster.KMeans(
    n_clusters=clusters,
    random_state=173,
    init='k-means++',
    n_init=10,
    algorithm="full",
)
km.fit(data)

# One cluster id per row of `dataset`, indexed like `dataset` (index named "file").
results = pd.DataFrame({"cluster": km.labels_}, index=dataset.index.rename("file"))

# ---- Plotting ----
colors_custom = ["#8000FF", "aqua", "#FDD835", "green"]

cmap = matplotlib.colors.ListedColormap(colors_custom)
cmap.set_under("crimson")
cmap.set_over("w")

colors = results["cluster"].tolist()   # cluster id of every row
labels = results.index.tolist()        # languages

# BPE coordinates are taken from all_features, whose rows are the ones `some_features`
# was cut from. The earlier source, `all_values`, was a leftover of the last iteration
# of the feature loop and had no guaranteed alignment with `dataset`.
bpe_points = all_features[bpe_columns]
assert bpe_points.index.equals(dataset.index), "BPE points and clustered rows are misaligned"

fig = plt.figure(figsize=(13, 13))
ax = fig.add_subplot(111, projection='3d')

xs = bpe_points['prod']
ys = bpe_points['cum_freq']
zs = bpe_points['idiosincracy_index']

ax.set_xlabel('|W| Productivity', fontdict=dict(weight='bold'))
ax.set_ylabel('C. freq', fontdict=dict(weight='bold'))
ax.set_zlabel('Idiosyncrasy', fontdict=dict(weight='bold'))

img = ax.scatter(xs, ys, zs, s=50, alpha=0.6, edgecolors='w', c=colors, cmap=cmap)
ax.set_title("Clustering WALS based " + caption)

for x, y, z, label, cluster_id in zip(xs, ys, zs, labels, colors):
    # Only the clusters that have a colour in colors_custom get a text label.
    if 0 <= cluster_id < len(colors_custom):
        # `weight` and `fontweight` are aliases. When both were given, the later
        # "medium" took effect, and recent Matplotlib releases raise TypeError on the
        # pair, so only one is passed.
        ax.text(x, y, z, label, size="large", color=colors_custom[cluster_id], fontweight="medium")

# Discrete colorbar: one tick per cluster id.
cbar = fig.colorbar(img, fraction=0.026, pad=0.04, ticks=np.linspace(0, clusters - 1, clusters))

plt.savefig("../outputs/BPEresults_productivity_corpusPBCtok_0_200_1/WALSClustering.png")

# ---- Silhouette score ----
# Computed on the BPE coordinates with the labels obtained from the WALS features.
# Other metrics accepted by silhouette_score (e.g. 'braycurtis') can be swapped in here.
X = bpe_points
S = metrics.silhouette_score(X, km.labels_, metric="sqeuclidean")
print(S)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

0.11819002209656738
