In [None]:
import multiprocessing as mp
from copy import deepcopy

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from finch import FINCH
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import sklearn.metrics as skm
from sklearn.cluster import *
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import * #DotProduct, WhiteKernel
from sklearn import mixture
from scipy import stats  # stats.entropy and stats.multivariate_normal are used below

In [None]:
# Read the LinkedIn company records CSV with the C parser, without chunked
# (low-memory) type inference.
data = pd.read_csv(
    'temp_datalab_records_linkedin_company.csv',
    engine='c',
    low_memory=False,
)

In [None]:
def window_frame(df,chunk_size):
    '''
        Split a dataframe into consecutive, non-overlapping windows of exactly
        chunk_size rows.

        Only complete windows are kept: trailing rows that do not fill a whole
        window are left out, so a dataframe shorter than chunk_size yields no
        window at all.

        :params df, chunk_size: dataframe and number of rows per window
        :returns: (window_stamps, dico) where window_stamps is the ordered list
                  of labels '$W^<chunk_size>_<k>$' (k starts at 1) and dico maps
                  each label to its sub-dataframe
        :raises ValueError: if chunk_size is not strictly positive
    '''
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer, got ' + str(chunk_size))

    window_stamps = []
    dico = {}
    # Positional slicing replaces np.split(df, ...): on recent pandas that call
    # goes through the deprecated DataFrame.swapaxes and emits a FutureWarning.
    for k in range(len(df) // chunk_size):
        stamp = '$W^'+str(chunk_size)+'_'+str(k+1)+'$'
        window_stamps.append(stamp)
        dico[stamp] = df.iloc[k * chunk_size:(k + 1) * chunk_size]

    return window_stamps, dico

def added_vals(list_values):
    '''
        Lag-1 differences of a sequence, padded with a leading 0 so that the
        output has the same length as the input.
    '''
    count = len(list_values)
    if count == 0:
        return []
    # The first element has no predecessor, so its difference is set to 0.
    deltas = [0]
    deltas.extend(list_values[k] - list_values[k-1] for k in range(1, count))
    return deltas

def company_information(df,company):
    '''
        Extract the follower and employee history of one company.

        The rows of df whose company_name equals company are taken in the order
        they appear in df (no sorting is done here) and two lag-1 difference
        columns are added with added_vals.

        :params df, company: dataframe of records and the company name to select
        :returns: a new dataframe with columns as_of_date, company_name,
                  followers_count, added_followers_count, employees_on_platform
                  and added_employees_on_platform; df itself is not modified
    '''

    # Explicit copy: assigning new columns on a boolean-mask selection of df
    # would otherwise write to a slice (SettingWithCopyWarning).
    df2 = df[df.company_name == company].copy()
    df2['added_followers_count'] = added_vals(df2.followers_count.values)
    df2['added_employees_on_platform'] = added_vals(df2.employees_on_platform.values)
    return df2.loc[:, ['as_of_date','company_name','followers_count',
                       'added_followers_count','employees_on_platform','added_employees_on_platform']]

def get_all_company_followers(df,companies):
    '''
        Build one table of daily added followers for several companies.

        :params df, companies: dataframe and the list of companies
        :returns: dataframe indexed by as_of_date with one column per company
                  holding its added_followers_count; dates on which a company
                  has no record are filled with 0
    '''
    def followers_column(name):
        # Single-column frame named after the company, indexed by date.
        frame = company_information(df,name)
        frame[name] = frame.added_followers_count
        frame = frame.set_index('as_of_date')
        return frame.loc[:,[name]]

    per_company = [followers_column(name) for name in companies]
    return pd.concat(per_company, axis=1).fillna(value=0)

def scale_series(df,range_interval):

    '''
        Min-max scale every column of a dataframe into range_interval.

        Each column is scaled independently of the others (MinMaxScaler works
        feature-wise), so a single fit_transform over the whole table gives the
        same values as scaling the columns one at a time.

        :params df, range_interval: dataframe and tuple of 2 values (x,y), x<y
        :returns: a new dataframe with the same index and columns as df; df
                  itself is left untouched
    '''

    # Nothing to scale; MinMaxScaler would reject a table without features.
    if df.shape[1] == 0:
        return df.copy()

    # A plain ndarray is passed on purpose: np.matrix input is rejected by
    # scikit-learn (TypeError since version 1.2).
    scaler = MinMaxScaler(feature_range=range_interval)
    scaled = scaler.fit_transform(df.to_numpy())

    return pd.DataFrame(scaled, index=df.index, columns=df.columns)

def clustering(df):
    '''
        Run FINCH clustering on df and return FINCH's raw output.
        df is converted to a numpy array and passed to FINCH as-is.
    '''
    samples = np.array(df)
    return FINCH(samples, verbose=False)

def get_nbre_components2(df):
    '''
        Run FINCH on df and return the length of the second element of its output.
        NOTE(review): presumably that element lists the cluster count of each
        partition FINCH produced - confirm against the finch package.
    '''
    finch_output = FINCH(np.array(df), verbose=False)
    return len(finch_output[1])

def scanning2(list_frames):
    '''
        Calculate the variation score of a list of dataframes.

        get_nbre_components2 is evaluated on every dataframe in a process pool;
        the score is the largest of these values divided by the number of
        columns of the first dataframe.

        :params list_frames: non-empty list of dataframes
        :returns: the variation score (a float)
        :raises ValueError: if list_frames is empty
    '''
    if len(list_frames) == 0:
        raise ValueError('scanning2 needs at least one dataframe')

    # The context manager terminates the pool even when a worker raises,
    # so no worker processes are left behind.
    with mp.Pool() as pool:
        results = pool.map(get_nbre_components2, list_frames)

    return max(results) / len(list(list_frames[0]))

def get_nbre_components3(df,k,seed):
    '''
        Fit a Gaussian mixture with k full-covariance components on df.

        :params df, k, seed: data (converted to a numpy array), number of
                             components and random_state of the mixture
        :returns: (means, covariances) of the fitted mixture
    '''
    model = mixture.GaussianMixture(n_components=k, covariance_type='full', random_state=seed)
    model.fit(np.array(df))
    return model.means_, model.covariances_

def entropy_cut_off(scores, threshold=0.042):
    '''
        Identify a cut-off position in a list of scores.

        For every split position i in 2 .. len(scores)-1 the scores are divided
        into a head E = scores[0:i] and a tail F = scores[i:]. Each part gets a
        length-normalised entropy, stats.entropy(part)/len(part), and the split
        is rated by the absolute difference between the two. The smallest i
        whose rating is below threshold is selected.

        :params scores, threshold: sequence of scores and the maximal entropy
                                   gap accepted for a split (default 0.042)
        :returns: (result, code, entropy_E, entropy_F) where
                  result is (i, E, F, rating) for the selected split, or None
                  when no split qualifies (including inputs too short to split);
                  code maps i to its rating;
                  entropy_E maps i-1 to the head entropy of split i;
                  entropy_F maps i to the tail entropy of split i
    '''
    entropy_E = {}
    entropy_F = {}
    code = {}
    for i in range(2,len(scores)):
        head = scores[0:i]
        tail = scores[i:len(scores)]
        # Key offsets (i-1 for the head, i for the tail) are part of the
        # returned structure and are kept as they were.
        entropy_E[i-1] = stats.entropy(head)/len(head)
        entropy_F[i] = stats.entropy(tail)/len(tail)
        code[i] = np.abs(entropy_E[i-1] - entropy_F[i])

    result = None
    for i in sorted(code):
        if code[i] < threshold:
            result = (i, scores[0:i], scores[i:len(scores)], code[i])
            break
    return result, code, entropy_E, entropy_F

def _plot_company_column(df,company,column):
    '''
        Plot one column of company_information(df, company) against as_of_date.
    '''
    company_info = company_information(df,company).loc[:, ['as_of_date',column]]
    company_info = company_info.set_index('as_of_date')
    fig = plt.figure(figsize=(12,4),dpi=125)
    axe = fig.add_subplot(111)
    company_info.plot(ax=axe, lw=1.5,alpha=1)
    plt.show()

def plot_company_followers(df,company):
    '''
        Plot the added_followers_count series of a company over as_of_date.
        :params df, company: dataframe of records and the company name
    '''
    _plot_company_column(df,company,'added_followers_count')

def plot_company_employees(df,company):
    '''
        Plot the added_employees_on_platform series of a company over as_of_date.
        :params df, company: dataframe of records and the company name
    '''
    _plot_company_column(df,company,'added_employees_on_platform')

In [None]:
# Distinct company names, in order of first appearance in the records.
companies = list(data['company_name'].unique())

In [None]:
# Distinct snapshot dates present in the records.
dates = data['as_of_date'].unique()

In [None]:
# Daily added followers for the first ten companies only.
whole_data = get_all_company_followers(data, companies[:10])

In [None]:
# Bring every company's series into [0, 1] before plotting them together.
scaled_whole_data = scale_series(whole_data, range_interval=(0, 1))
scaled_whole_data.plot()
plt.show()

In [None]:
# Scan growing window sizes and record the variation score of each one.
# The scan stops when two consecutive scores differ by less than
# `convergence_tol`, or when the next window would be longer than the series.
window_sizes = []
nbre_components = {}
dynamic = {}
step_forward = 7
size = 5
cpt = 0
convergence_tol = .001

dyn = None  # score of the previously evaluated window size
# The upper bound keeps `size` <= len(scaled_whole_data): a longer window would
# yield zero windows from window_frame and make scanning2 fail.
for size in range(size, len(scaled_whole_data) + 1, step_forward):
    window_instances, dico_fr = window_frame(scaled_whole_data,size)
    #window_instances, dico_fr = get_overlapping_window_instances2(test,size,size//2)
    current_dyn = scanning2([fr.T for fr in dico_fr.values()])
    window_sizes.append(size)
    dynamic[size] = current_dyn
    print(current_dyn, end=';')
    if dyn is not None and np.abs(current_dyn - dyn) < convergence_tol:
        break
    dyn = current_dyn

if not window_sizes:
    raise ValueError('scaled_whole_data is shorter than the smallest window size')

In [None]:
#Threshold identification

# Scores in scan order; they are both plotted and searched for the cut-off.
scores = np.array([dynamic[s] for s in window_sizes])

fig = plt.figure(figsize=(4,4),dpi=150)
sns.set_style('ticks')
axe = fig.add_subplot(111)
axe.set_facecolor('white')
axe.plot(window_sizes, scores,'k-o',lw=.7,alpha=1,markersize=3)
result, code, entropy_E, entropy_F = entropy_cut_off(scores)

# entropy_cut_off yields None when no split is below its threshold; fail with
# a readable message instead of a TypeError on result[0].
if result is None:
    raise ValueError('entropy_cut_off found no cut-off for the scanned window sizes')

# result[0] is the split position, i.e. the index of the first window size
# belonging to the tail part of the scores.
tresh = window_sizes[result[0]]


axe.vlines(x=tresh,ymin=0,ymax=max(list(dynamic.values()))-.1,
           color='red',label='Optimal window: '+str(tresh))
axe.xaxis.set_tick_params(direction='in',top=False,bottom=True,length=8,color='k',labelsize=13,pad=2)
axe.yaxis.set_tick_params(direction='in',top=False,bottom=True,length=8,color='k',labelsize=13,pad=2)
axe.minorticks_on()
axe.set_ylabel('Series variation',weight='semibold',fontsize=18)
axe.set_xlabel('Window size',weight='semibold',fontsize=18)
# NOTE(review): matplotlib documents `fontsize` as used only when `prop` is not
# given, so fontsize=20 likely has no effect here - confirm the intended size.
axe.legend(loc='upper right',fontsize=20,prop={'weight':'bold'})
plt.xticks(weight='semibold')
plt.yticks(weight='semibold')
axe.text(7,.6,'Sparse area',color='darkgreen',fontsize=10,weight='semibold')
axe.text(tresh+5,.2,'Dense area',color='darkred',fontsize=10,weight='semibold')

plt.show()

In [None]:
def get_mean_variance(rho,data,nbre_reg,seed):
    '''
        Fit a Gaussian mixture on all windows of length rho taken from data.

        data is cut into windows of rho rows with window_frame. The windows are
        placed side by side (their rows re-indexed 0..rho-1) and the result is
        transposed, so each row of the fitted matrix is one column of data
        restricted to one window. The mixture itself is fitted by
        get_nbre_components3.

        :params rho, data, nbre_reg, seed: window length, dataframe, number of
                                           mixture components and random seed
        :returns: (mean, variance) as returned by get_nbre_components3
        :raises ValueError: if data holds no complete window of rho rows
    '''
    window_instances2, dico_fr2 = window_frame(data,rho)
    if not window_instances2:
        raise ValueError('data holds no complete window of ' + str(rho) + ' rows')

    # Re-index every window to 0..rho-1 so the side-by-side concat aligns the
    # rows by position; a single concat avoids re-copying the growing table.
    aligned = [dico_fr2[stamp].reset_index(drop=True) for stamp in window_instances2]
    taje = pd.concat(aligned,axis=1)

    mean,variance = get_nbre_components3(taje.T,nbre_reg,seed)
    return mean,variance
def get_final_regimes(mean,variance,seed):
    '''
        Draw one sample per mixture component and keep the least connected ones.

        One multivariate-normal draw is taken per (mean[i], variance[i]) pair.
        The draws are scaled to (1,2) and handed to get_network_from_series
        (defined outside this block); columns whose degree in the returned
        graph is at most the median degree are kept.

        :params mean, variance, seed: component means, component covariances
                                      and the random_state used for every draw
        :returns: dataframe with one column per retained draw; the unfiltered
                  draws when every column passes the degree test
    '''
    # One draw per component, named 'R<position>_<component index>'.
    drawn = {}
    for comp in range(len(mean)):
        sample = stats.multivariate_normal.rvs(mean[comp,:],variance[comp],size=1,random_state=seed)
        drawn['R'+str(comp+1)+'_'+str(comp)] = sample
    patterns = pd.DataFrame(drawn)

    new_patterns = scale_series(patterns, (1,2))
    g = get_network_from_series(new_patterns)[0]
    print(g.degree())
    print(np.median(g.degree()))
    index = np.where(np.array(g.degree())<=np.median(g.degree()))
    kept = index[0]

    # Every column passed the degree test: nothing to filter or rename.
    if len(kept) == len(list(patterns)):
        return patterns

    # Rebuild the table from the retained columns; names become
    # 'R<rank among retained>_<original column position>'.
    columns = list(patterns)
    final_patterns = {}
    for rank, pos in enumerate(kept, start=1):
        final_patterns['R'+str(rank)+'_'+str(pos)] = np.array(patterns[columns[pos]])
    return pd.DataFrame(final_patterns)

# Window length used for regime extraction.
rho = 24
# result[1] is the head part of the scores returned by entropy_cut_off; its
# last value, times rho, gives the number of mixture components.
K = int(result[1][-1]*rho)
print(K)
seed = 11
# `test` was never defined in this notebook; the regimes are extracted from
# scaled_whole_data, the same series the window scan above was run on.
mean,variance = get_mean_variance(rho,scaled_whole_data,K,seed)
final_patterns = get_final_regimes(mean,variance,seed)
    
# One small panel per retained regime, each drawn from the (0,1)-scaled patterns.
fig = plt.figure(figsize=(10,2),dpi=125)
sns.set_style('ticks')
# Scaling does not depend on the panel, so it is done once before the loop.
show = scale_series(final_patterns, (0,1))
regime_names = list(final_patterns)
for i, regime in enumerate(regime_names):
    axe = fig.add_subplot(1,len(regime_names),i+1)
    show[regime].plot(color='k',linestyle='-',lw=.75,alpha=1,ax=axe)
    axe.xaxis.set_tick_params(direction='in',top=False,bottom=True,length=2,color='k',labelsize=8,pad=2)
    axe.yaxis.set_tick_params(direction='in',top=False,bottom=True,length=2,color='k',labelsize=8,pad=2)
    axe.set_facecolor('white')
    plt.xticks(weight='semibold')
    plt.yticks(weight='semibold')
    # Only the left-most panel carries the y-axis label.
    if i == 0: axe.set_ylabel('Companies',weight='semibold')
plt.subplots_adjust(wspace=.23)
plt.show()