In [None]:
# Checkpoint name handed to `from_pretrained` for both the tokenizer and the model.
model_name = "bloomz-7b1mt"
from transformers import AutoTokenizer,AutoModelForCausalLM,GenerationConfig,AutoModel
# Load tokenizer and causal-LM weights; the model is moved to the GPU right away.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
# Model-family tag; later cells branch on `'bloom' in MODEL`.
MODEL = 'bloomz'

In [None]:
from baukit import Trace, TraceDict

def get_out_bloomz(model, prompt, device,index): 
    """Trace the output of every BLOOM MLP activation (`mlp.gelu_impl`) for one prompt.

    Args:
        model: causal LM whose blocks are reachable as `transformer.h.<i>.mlp.gelu_impl`.
        prompt: tensor of input ids passed positionally to `model(...)`.
        device: unused; kept so existing call sites keep working.
        index: token position to keep from each traced output (e.g. -1 for the last
            token). The analysis cells index the result as (layer, neuron), so they
            need a single position. Pass None to get the full traced output instead.

    Returns:
        A list with one CPU tensor per layer. With an integer `index` each tensor is
        the activation vector of the first batch element at that position; with
        `index=None` it is the unmodified traced output.
        Assumes each traced output is (batch, seq, neurons) — TODO confirm for other
        model versions.
    """
    model.eval()
    # Use the model's real depth; fall back to the previously hard-coded 30.
    n_layers = getattr(model.config, "n_layer", 30)
    MLP_act = [f"transformer.h.{i}.mlp.gelu_impl" for i in range(n_layers)]

    with torch.no_grad():
        with TraceDict(model, MLP_act) as ret:
            # Only the traced module outputs are needed; the model output is discarded.
            model(prompt, output_hidden_states = True,output_attentions=True)

        MLP_act_value = []
        for act_value in MLP_act:
            layer_out = ret[act_value].output
            if index is not None:
                # First (only) batch element, requested token position.
                layer_out = layer_out[0, index]
            # Move to host so the result can be turned into a NumPy array and so the
            # GPU memory held by the trace can be released.
            MLP_act_value.append(layer_out.detach().cpu())
        return MLP_act_value
    
    
def act_bloom(input_ids):
    """Return the traced MLP activations of the global `model` as one NumPy array.

    Args:
        input_ids: tensor of token ids for a single prompt (already on the model's device).

    Returns:
        np.ndarray whose first axis runs over layers; the remaining axes are whatever
        `get_out_bloomz(..., index=-1)` returns per layer.
    """
    mlp_act = get_out_bloomz(model,input_ids,model.device,-1)
    # np.array() cannot read CUDA tensors directly (Tensor.__array__ raises), so every
    # layer is detached and copied to host memory before stacking.
    mlp_act = np.array([
        layer.detach().cpu().numpy() if torch.is_tensor(layer) else np.asarray(layer)
        for layer in mlp_act
    ])
    return mlp_act

# Model-size constants used by the analysis cells below (only set for BLOOM-family models).
if 'bloom' in MODEL:
    LAYERS = model.config.n_layer  # number of transformer blocks reported by the config
    Neuron_num = 16384  # hard-coded neuron count per MLP layer — presumably 4 * hidden_size for this checkpoint; TODO confirm


In [None]:
# Language codes under analysis. The position of a code in this list is the per-language
# index used by load_data_XNLI and by the subplot titles (langs[i-1]).
langs = ['en','de','es','fr','ru','th','tr','vi','zh','pt']
# Matplotlib colour names; the effective-score scatter plot picks colors[i+2] for language i.
colors = ['cornflowerblue','forestgreen','orange','red','mediumturquoise','midnightblue','brown','moccasin','darkviolet','gold','deeppink','gray','teal','slateblue']

## XNLI data

In [None]:
def load_data_XNLI(path="./mnli/xnli.test.tsv", languages=None, limit=1000):
    """Build parallel NLI prompts from an XNLI-style tab-separated file.

    The first line of the file is treated as a header. For every other row, column 0
    is the language code, column 1 the label, and columns 6 and 7 the premise and
    hypothesis. Rows are grouped per language in file order; the k-th row of each
    language is assumed to be the same example in a different language.

    Args:
        path: location of the TSV file.
        languages: language codes to keep, in output order. Defaults to the
            notebook-level `langs` list.
        limit: maximum number of examples to return (None for all).

    Returns:
        (question_all, answer_all): two lists with one entry per example; each entry
        is a list with one prompt / one label per language, ordered like `languages`.

    Raises:
        IndexError: if some language has fewer rows than the first language, so the
            examples cannot be aligned.
    """
    if languages is None:
        languages = langs

    # Language code -> bucket position; the first occurrence wins, like list.index().
    slot_of = {}
    for position, code in enumerate(languages):
        slot_of.setdefault(code, position)

    # One bucket per requested language (previously fixed at 5, which overflowed as
    # soon as a sixth language appeared in the file).
    copora = [[] for _ in languages]
    with open(path, encoding="utf-8") as f:
        next(f, None)  # skip the header row
        for line in f:
            fields = line.split('\t')
            if len(fields) < 8:
                continue  # blank or truncated row: nothing usable
            slot = slot_of.get(fields[0])
            if slot is not None:
                copora[slot].append((fields[1], fields[6], fields[7]))

    if not copora:
        return [], []

    length = len(copora[0])
    lacking = [code for code, rows in zip(languages, copora) if len(rows) < length]
    if lacking:
        raise IndexError(
            f"languages {lacking} have fewer than {length} rows in {path}; "
            "cannot align parallel examples"
        )
    if limit is not None and limit < length:
        length = limit

    question_all, answer_all = [], []
    for ind in range(length):
        prompts, labels = [], []
        for rows in copora:
            label, premise, hypothesis = rows[ind]
            prompts.append(f'Take the following as truth: {premise}\nThen the following statement: "{hypothesis}" is "true", "false", or "inconclusive"?')
            labels.append(label)
        question_all.append(prompts)
        answer_all.append(labels)
    return question_all,answer_all

## Attribution

In [None]:
from torch.nn import functional as F
import numpy

class ModelingRequests():
    """Record last-token activations of the notebook's global model and score MLP neurons.

    Forward hooks store, for the last token of a prompt, the residual stream at several
    points of every block, the attention/MLP outputs and the input of each
    `mlp.dense_4h_to_h` (called "m_coef": one coefficient per MLP neuron). The
    scoring methods combine those coefficients with the rows of `dense_4h_to_h`.

    Typical use: `req.cal_prompt(text)` followed by `req.contribution(...)`,
    `req.all_contribution_values(...)` or `req.effective_score(...)`.
    """

    # Attribute on the model that holds the handles of the hooks we registered, so a
    # later call (from any instance) can remove them instead of stacking new ones.
    _HOOK_HANDLES_ATTR = "_modeling_requests_hook_handles"

    def __init__(self):
        self.model = model
        self.device = 'cuda'
        self.tokenizer = tokenizer
        self.TOP_K = 800  # how many top entries the ranking methods keep
        self.wte = model.transformer.word_embeddings.weight  # token embedding matrix

    def cal_prompt(self,prompt):
        """Run `prompt` through the model and cache its activations and predictions on self."""
        self.to_hidden_states, self.to_preds = self.get_preds_and_hidden_states(prompt)

    def set_hooks_gpt2(self):
        """(Re)install the forward hooks that fill `self.model.activations_`.

        Hooks registered by an earlier call are removed first. Without this, every
        prompt added another full set of hooks and each forward pass got slower.
        """
        final_layer = self.model.config.n_layer - 1

        for stale in getattr(self.model, self._HOOK_HANDLES_ATTR, []):
            stale.remove()
        handles = []
        setattr(self.model, self._HOOK_HANDLES_ATTR, handles)

        for attr in ["activations_"]:
            if not hasattr(self.model, attr):
                setattr(self.model, attr, {})

        def get_activation(name):
            def hook(module, input, output):
                if "mlp" in name or "attn" in name or "m_coef" in name:
                    if "attn" in name:
                        # assumes output[0] is (batch, seq, hidden) — TODO confirm
                        num_tokens = list(output[0].size())[1]
                        self.model.activations_[name] = output[0][:, num_tokens - 1].detach()
                    elif "mlp" in name:
                        # output[0] is indexed along its first axis, i.e. it is treated
                        # as (seq, hidden) for the first batch element.
                        num_tokens = list(output[0].size())[0]
                        self.model.activations_[name] = output[0][num_tokens - 1].detach()
                    elif "m_coef" in name:
                        num_tokens = list(input[0].size())[1]  # (batch, sequence, hidden_state)
                        self.model.activations_[name] = input[0][:, num_tokens - 1].detach()
                elif "residual" in name or "embedding" in name:
                    num_tokens = list(input[0].size())[1]  # (batch, sequence, hidden_state)
                    if name == "layer_residual_" + str(final_layer):
                        # The last block has no following layernorm input to read, so its
                        # residual is rebuilt from the two pieces recorded for that block.
                        self.model.activations_[name] = self.model.activations_[
                                                            "intermediate_residual_" + str(final_layer)] + \
                                                        self.model.activations_["mlp_" + str(final_layer)]

                    else:
                        self.model.activations_[name] = input[0][:,
                                                        num_tokens - 1].detach()

            return hook

        def register(module, name):
            handles.append(module.register_forward_hook(get_activation(name)))

        blocks = self.model.transformer.h
        register(blocks[0].input_layernorm, "input_embedding")

        for i in range(self.model.config.n_layer):
            if i != 0:
                # The input of block i's first layernorm is the output of block i-1.
                register(blocks[i].input_layernorm, "layer_residual_" + str(i - 1))
            register(blocks[i].post_attention_layernorm, "intermediate_residual_" + str(i))

            register(blocks[i].self_attention, "attn_" + str(i))
            register(blocks[i].mlp, "mlp_" + str(i))
            register(blocks[i].mlp.dense_4h_to_h, "m_coef_" + str(i))

        register(self.model.transformer.ln_f, "layer_residual_" + str(final_layer))

    def get_resid_predictions(self,sentence, start_idx=None, end_idx=None, set_mlp_0=False):
        """Run `sentence` through the model and decode every recorded residual state.

        Each "layer_residual_*" / "intermediate_residual_*" activation is passed through
        the final layernorm and the LM head; the TOP_K most probable tokens are kept as
        (probability, decoded_token) pairs. Results are stored on the model as
        `layer_resid_preds` and `intermed_residual_preds`.

        Args:
            sentence: prompt text.
            start_idx, end_idx: if both are given, only the whitespace-separated words
                `[start_idx:end_idx]` of the sentence are used.
            set_mlp_0: unused; kept for interface compatibility.
        """
        layer_residual_preds = []
        intermed_residual_preds = []

        if start_idx is not None and end_idx is not None:
            tokens = [
                token for token in sentence.split(' ')
                if token not in ['', '\n']
            ]

            sentence = " ".join(tokens[start_idx:end_idx])
        tokens = self.tokenizer(sentence, return_tensors="pt")
        tokens = tokens.to(self.device)

        # Inference only: no autograd graph is needed for the forward pass or the decoding.
        with torch.no_grad():
            self.model(**tokens, output_hidden_states=True)

            # Snapshot the keys: calling ln_f below re-triggers its own hook, which
            # rewrites (but never adds) an entry of activations_.
            for layer in list(self.model.activations_.keys()):
                is_layer = "layer_residual" in layer
                is_intermediate = "intermediate_residual" in layer
                if not (is_layer or is_intermediate):
                    continue

                normed = self.model.transformer.ln_f(self.model.activations_[layer])
                logits = torch.matmul(self.model.lm_head.weight, normed.T)
                probs = F.softmax(logits.T[0], dim=-1)
                probs = torch.reshape(probs, (-1,)).detach().cpu().numpy()

                assert np.abs(np.sum(probs) - 1) <= 0.01, str(np.abs(np.sum(probs) - 1)) + layer

                # Stable descending order: ties keep ascending token-id order, exactly
                # like a stable Python sort with reverse=True over (id, prob) pairs.
                order = np.argsort(-probs, kind="stable")[:self.TOP_K]
                top_k = [(probs[i].item(), self.tokenizer.decode(int(i))) for i in order]

                if is_layer:
                    layer_residual_preds.append(top_k)
                elif is_intermediate:
                    intermed_residual_preds.append(top_k)

        self.model.layer_resid_preds = layer_residual_preds
        self.model.intermed_residual_preds = intermed_residual_preds

    def get_preds_and_hidden_states(self,prompt):
        """Install the hooks, run `prompt`, and return (activations, predictions).

        Returns:
            sent_to_hidden_states: shallow copy of `model.activations_`.
            sent_to_preds: dict with "layer_resid_preds" and "intermed_residual_preds".
        """
        self.set_hooks_gpt2()

        sentence = prompt[:]
        self.get_resid_predictions(sentence)
        sent_to_preds = {
            "layer_resid_preds": self.model.layer_resid_preds,
            "intermed_residual_preds": self.model.intermed_residual_preds,
        }
        sent_to_hidden_states = self.model.activations_.copy()

        return sent_to_hidden_states, sent_to_preds

    def process_and_get_data(self,prompt):
        """Run `prompt` and collect, per layer, the top coefficients and top predictions.

        Returns a dict with the prompt ("sent"), per-layer tuples of the TOP_K largest
        raw coefficients (indices and values), per-layer top token probabilities and
        tokens for the intermediate and layer residuals, and "layer_residual_vec",
        which holds the residual vector of the LAST layer only.
        """
        sent_to_hidden_states, sent_to_preds = self.get_preds_and_hidden_states(prompt)
        top_coef_idx = []
        top_coef_vals = []
        residual_preds_probs = []
        residual_preds_tokens = []
        layer_preds_probs = []
        layer_preds_tokens = []
        res_vec = None
        for LAYER in range(self.model.config.n_layer):
            m_coefs = sent_to_hidden_states["m_coef_" + str(LAYER)].squeeze(0).cpu().numpy()
            res_vec = sent_to_hidden_states["layer_residual_" + str(LAYER)].squeeze(0).cpu().numpy()
            # Raw (signed, unscaled) coefficients are ranked here.
            scaled_coefs = m_coefs

            order = np.argsort(-scaled_coefs, kind="stable")[:self.TOP_K]
            top_coef_idx.append(tuple(order.tolist()))
            top_coef_vals.append(tuple(scaled_coefs[order]))

            residual_p_probs, residual_p_tokens = zip(*sent_to_preds['intermed_residual_preds'][LAYER])
            residual_preds_probs.append(residual_p_probs)
            residual_preds_tokens.append(residual_p_tokens)

            layer_p_probs, layer_p_tokens = zip(*sent_to_preds['layer_resid_preds'][LAYER])
            layer_preds_probs.append(layer_p_probs)
            layer_preds_tokens.append(layer_p_tokens)

        return {
            "sent": prompt,
            "top_coef_idx": top_coef_idx,
            "top_coef_vals": top_coef_vals,
            "residual_preds_probs": residual_preds_probs,
            "residual_preds_tokens": residual_preds_tokens,
            "layer_preds_probs": layer_preds_probs,
            "layer_preds_tokens": layer_preds_tokens,
            "layer_residual_vec": res_vec,
        }

    def _scaled_coefs(self, LAYER):
        """|coefficient| * L2 norm of the matching `dense_4h_to_h` input row, per neuron."""
        m_coefs = self.to_hidden_states["m_coef_" + str(LAYER)].squeeze(0).cpu().numpy()
        value_norms = torch.linalg.norm(self.model.transformer.h[LAYER].mlp.dense_4h_to_h.weight.t().data, dim=1).cpu()
        return np.absolute(m_coefs) * value_norms.numpy()

    def contribution(self, indexs):
        """Rank candidate neurons of every layer by contribution score.

        Requires a prior `cal_prompt` call.

        Args:
            indexs: two-dimensional; `indexs[layer]` holds the neuron ids to consider
                in that layer. Ids outside the layer's neuron range are ignored.

        Returns:
            dict with "top_contribution_idx" and "top_contribution_vals": per layer, the
            ids and scores of the (at most) TOP_K highest-scoring candidates, best first.
        """
        top_contribution_idx = []
        top_contribution_vals = []
        for LAYER in range(self.model.config.n_layer):
            scaled_coefs = self._scaled_coefs(LAYER)
            # Vectorised membership test; the previous `index in list` check inside a
            # loop over all neurons was quadratic in the layer width.
            wanted = np.isin(np.arange(scaled_coefs.shape[0]), np.asarray(list(indexs[LAYER])))
            candidates = np.flatnonzero(wanted)
            if candidates.size > 0:
                order = np.argsort(-scaled_coefs[candidates], kind="stable")[:self.TOP_K]
                chosen = candidates[order]
                top_contribution_idx.append(chosen.tolist())
                top_contribution_vals.append(list(scaled_coefs[chosen]))
            else:
                top_contribution_idx.append([])
                top_contribution_vals.append([])
        return {
            "top_contribution_idx": top_contribution_idx,
            "top_contribution_vals": top_contribution_vals,
        }

    def all_contribution_values(self, indexs):
        """Return the contribution score of every neuron in every layer.

        Requires a prior `cal_prompt` call.

        Args:
            indexs: unused; kept so existing call sites keep working.

        Returns:
            dict with "contribution_vals": one 1-D array of scores per layer.
        """
        contribution_vals = [
            self._scaled_coefs(LAYER) for LAYER in range(self.model.config.n_layer)
        ]
        return {
            "contribution_vals": contribution_vals,
        }

    def effective_score(self,token_id, neuron_idxs,layer_idxs=None):
        """Score each MLP neuron by how much it pushes the residual toward a token.

        For neuron j with coefficient m_j and `dense_4h_to_h` input row v_j, the score
        is `embedding(token_id) . (m_j * v_j)`, computed as `m_j * (v_j . embedding)`.
        Requires a prior `cal_prompt` call.

        Args:
            token_id: id of the reference token (row of the embedding matrix).
            neuron_idxs: not used for selection; scores are returned for all neurons.
            layer_idxs: layers to score. If omitted, the second positional argument is
                taken as the layer list, which supports the two-argument call form
                `effective_score(token_id, layers)`.

        Returns:
            list with one 1-D NumPy array of per-neuron scores for each requested layer.
        """
        if layer_idxs is None:
            layer_idxs = neuron_idxs

        scores = []
        with torch.no_grad():
            token_embedding = self.wte[token_id]
            for LAYER in layer_idxs:
                # (neurons, hidden): one row per MLP neuron.
                value = self.model.transformer.h[LAYER].mlp.dense_4h_to_h.weight.t().data
                m_coefs = self.to_hidden_states["m_coef_" + str(LAYER)].squeeze(0).to(value)
                score = m_coefs * (value @ token_embedding.to(value))
                scores.append(score.detach().cpu().numpy())
        return scores
    


## Contribution Score

In [None]:
import torch
import numpy as np
from numpy import *

# Load the parallel XNLI prompts (one list of per-language prompts per example).
question_all,answer_all = load_data_XNLI()

# NOTE(review): `length_filter` is not defined in the visible part of the notebook —
# presumably it drops over-long examples; verify it is defined before this cell runs.
questions,answers = length_filter(question_all,answer_all)
test_counts = len(questions)      # number of examples kept
langs_counts = len(questions[0])  # number of language versions per example


# Per-example results are accumulated here and averaged after the loop.
ratios = []
request = ModelingRequests()
# Number of top-contributing neurons kept per layer; the loop below divides by the same 800.
request.TOP_K=800

# For every parallel example: classify each MLP neuron by how many language versions
# activate it, then count which class the top-contributing neurons of each prompt fall into.
for q in range(test_counts):
    print(q)
    sents,anss = questions[q],answers[q]
    mlp_acts,mlp_ups,mlp_ups_acts = [],[],[]
    # Traced MLP activations for each language version of this example.
    for sent in sents:
        encodings = tokenizer(sent, return_tensors='pt')
        input_ids = encodings['input_ids'].to('cuda')

        if 'bloom' in MODEL:
            mlp_act = act_bloom(input_ids)
        mlp_acts.append(mlp_act)


    # Binarise: 1 where the activation is > 0. mlp_all counts, per (layer, neuron),
    # in how many languages the neuron fired.
    mlp_all_act = []
    for i in range(langs_counts):
        mlp_act = (mlp_acts[i]>0).astype(int)
        mlp_all_act.append(mlp_act)
    mlp_all = np.sum(mlp_all_act,axis=0)

    # all_activated[lang][layer]: ids of the neurons active for that language.
    all_activated = []
    for i in range(langs_counts):   
        activated = []
        for ly in range(LAYERS):
            act = torch.nonzero(torch.tensor(mlp_all_act[i][ly]==1)).squeeze(1).detach().cpu().numpy()
            activated.append(act) 
        all_activated.append(activated)

    # all_shared[layer]: neurons active in every language.
    all_shared = []
    for ly in range(LAYERS):
        mlp_all_act_inter = torch.nonzero(torch.tensor(mlp_all[ly]==langs_counts)).squeeze(1).detach().cpu().numpy()
        all_shared.append(mlp_all_act_inter)


    # non[layer]: neurons active in no language.
    non = []
    mlp_all = np.sum(mlp_all_act,axis=0)
    for ly in range(LAYERS):
        mlp_all_act_inter = torch.nonzero(torch.tensor(mlp_all[ly]==0)).squeeze(1).detach().cpu().numpy()
        non.append(mlp_all_act_inter)  


    # specific[lang][layer]: neurons active for this language and for exactly one
    # language overall (mlp_all == 1), i.e. active only for this language.
    specific = []
    for i in range(langs_counts):       
        indices_arr1 = np.where(mlp_all_act[i] == 1)
        indices_arr2 = np.where(mlp_all == 1)
        set_indices_arr1 = set(zip(indices_arr1[0], indices_arr1[1]))
        set_indices_arr2 = set(zip(indices_arr2[0], indices_arr2[1]))
        intersection = set_indices_arr1.intersection(set_indices_arr2)
        if len(intersection) > 0:
            rows_intersection, cols_intersection = zip(*intersection)
        else:
            rows_intersection, cols_intersection = [], []
        # Regroup the (layer, neuron) pairs into one neuron list per layer.
        row = [[] for _ in range(LAYERS)]
        for k in range(len(rows_intersection)):
            r,c = rows_intersection[k],cols_intersection[k]
            row[r].append(c)
        specific.append(row)


    # all_specific[layer]: language-specific neurons of all languages concatenated.
    all_specific = []
    for ly in range(LAYERS):
        specific_ly = []
        for lg in range(langs_counts):
            specific_ly += specific[lg][ly]
        all_specific.append(specific_ly)

    # partial_shared[layer]: every neuron id that is in none of the three classes above.
    partial_shared = []
    # NOTE(review): `all` shadows the built-in of the same name; 16384 is the hard-coded layer width.
    all = [i for i in range(16384)]

    for ly in range(LAYERS):
        other = all_shared[ly].tolist() + non[ly].tolist() + all_specific[ly]
        some = list((set(all)-set(other)))
        partial_shared.append(some)

    # For each language version, take the top-contributing neurons per layer and
    # count how many belong to each class.
    lang_ratio = [] 
    for ind in range(len(sents)):
        sent = sents[ind]        

        request.cal_prompt(sent)
        # Every neuron of every layer is a candidate.
        indexs = [[i for i in range(16384)] for _ in range(LAYERS)]
        cont_id = request.contribution(indexs)['top_contribution_idx']
        ratio = [] 
        for ly in range(LAYERS):
            # Counters: [all-shared, partial-shared, specific, non-activated].
            ratio_layer = [0,0,0,0]
            cont_layer_ids = cont_id[ly]
            all_shared_layer = all_shared[ly]
            non_layer = non[ly]
            specific_layer = all_specific[ly]
            partial_shared_layer = partial_shared[ly]
            for c_id in cont_layer_ids:
                if c_id in all_shared_layer:
                    ratio_layer[0] +=1
                elif c_id in partial_shared_layer:
                    ratio_layer[1] +=1
                elif c_id in specific_layer:
                    ratio_layer[2] +=1
                elif c_id in non_layer:
                    ratio_layer[3] +=1
            # 800 matches request.TOP_K set before the loop; keep the two in sync.
            ratio_layer = np.array(ratio_layer)/800
            ratio.append(ratio_layer.tolist())
        lang_ratio.append(ratio)
    ratios.append(lang_ratio)
# Average over examples -> (language, layer, class), then convert to percent.
ratios = np.mean(ratios,axis=0)
ratios = np.array(ratios)*100

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator

# Font properties for the shared figure legend.
legend_font = {
    'style': 'normal',
    'size': 9,  
    'weight': "bold", 
}


# Split the (language, layer, class) array into one (language, layer) array per class.
all_shared_data = ratios[:,:,0]
partial_shared_data = ratios[:,:,1]
specific_data = ratios[:,:,2]
non_data = ratios[:,:,3]

xLabel=[i+1 for i in range(LAYERS)]
fig = plt.figure(figsize=(10,5))
x = [i+1 for i in range(LAYERS)]  # 1-based layer numbers on the x axis

# One stacked bar chart per language on a 2 x 5 grid; each bar stacks the four classes.
for i in range(1, langs_counts+1):
    plt.subplot(2, 5, i)
    plt.bar(x,all_shared_data[i-1],color="#59A14F",label="All-shared Neuron")
    plt.bar(x,partial_shared_data[i-1],color="#F28E2B",bottom=np.array(all_shared_data[i-1]),label="Partial-shared Neuron")
    plt.bar(x,specific_data[i-1],color="#E15759",bottom=np.array(all_shared_data[i-1])+np.array(partial_shared_data[i-1]),label="Specific Neuron")
    plt.bar(x,non_data[i-1],color="#A0CBE8",bottom=np.array(all_shared_data[i-1])+np.array(partial_shared_data[i-1])+np.array(specific_data[i-1]),label="Non-activated Neuron")
    plt.title(langs[i-1],fontsize=9,fontweight='bold')
    plt.xticks(xLabel,fontsize=6.7,fontweight='bold')
    plt.yticks(fontsize=6.7,fontweight='bold')
    ax = plt.gca()
    # Tick every 5 layers on x and every 20 percentage points on y.
    ax.xaxis.set_major_locator(MultipleLocator(5))
    ax.yaxis.set_major_locator(MultipleLocator(20))

# Shared axis captions placed in figure coordinates.
fig.text(0.5, 0.045, 'Layers', ha='center',fontsize=9,fontweight='bold')
fig.text(0.08, 0.5, 'Contribution Ratio', va='center', rotation='vertical',fontsize=9,fontweight='bold')
fig.subplots_adjust(wspace=0.23,hspace=0.25) 

# All subplots share the same four series, so the first one supplies the legend entries.
lines, labels = fig.axes[0].get_legend_handles_labels()

fig.legend(lines, labels, bbox_to_anchor=(0.5,-0.03), loc=8,prop=legend_font,ncol=4)
plt.show()



In [None]:
import torch
import numpy as np
from numpy import *


# Load the parallel XNLI prompts (one list of per-language prompts per example).
question_all,answer_all = load_data_XNLI()

# NOTE(review): `length_filter` is not defined in the visible part of the notebook —
# verify it is defined before this cell runs.
questions,answers = length_filter(question_all,answer_all)
test_counts = len(questions)      # number of examples kept
langs_counts = len(questions[0])  # number of language versions per example


ratios = []
# Per-example statistics for the four neuron classes
# (all-shared, partial-shared, language-specific, non-activated).
as_mams,ps_mams,ls_mams,non_mams = [],[],[],[]
request = ModelingRequests()


# For every parallel example: classify each MLP neuron by how many language versions
# activate it, then summarise the contribution scores of each class per layer.
for q in range(test_counts):
    print(q)
    sents,anss = questions[q],answers[q]
    mlp_acts,mlp_ups,mlp_ups_acts = [],[],[]
    # Traced MLP activations for each language version of this example.
    for sent in sents:
        encodings = tokenizer(sent, return_tensors='pt')
        input_ids = encodings['input_ids'].to('cuda')

        # NOTE(review): attention_mask is computed but not used below.
        attention_mask = encodings['attention_mask'].to('cuda')

        if 'bloom' in MODEL:
            mlp_act = act_bloom(input_ids)
        mlp_acts.append(mlp_act)


    # Binarise: 1 where the activation is > 0. mlp_all counts, per (layer, neuron),
    # in how many languages the neuron fired.
    mlp_all_act = []
    for i in range(langs_counts):
        mlp_act = (mlp_acts[i]>0).astype(int)
        mlp_all_act.append(mlp_act)
    mlp_all = np.sum(mlp_all_act,axis=0)    

    # all_activated[lang][layer]: ids of the neurons active for that language.
    all_activated = []
    for i in range(langs_counts):   
        activated,activated_count = [],[]
        for ly in range(LAYERS):
            act = torch.nonzero(torch.tensor(mlp_all_act[i][ly]==1)).squeeze(1).detach().cpu().numpy()
            activated.append(act) 
            activated_count.append(len(act))
        all_activated.append(activated)

    # all_shared[layer]: neurons active in every language.
    all_shared = []
    for ly in range(LAYERS):
        mlp_all_act_inter = torch.nonzero(torch.tensor(mlp_all[ly]==langs_counts)).squeeze(1).detach().cpu().numpy()
        all_shared.append(mlp_all_act_inter)

    # non[layer]: neurons active in no language.
    non = []
    mlp_all = np.sum(mlp_all_act,axis=0)
    for ly in range(LAYERS):
        mlp_all_act_inter = torch.nonzero(torch.tensor(mlp_all[ly]==0)).squeeze(1).detach().cpu().numpy()
        non.append(mlp_all_act_inter)  

    # specific[lang][layer]: neurons active for this language and for exactly one
    # language overall (mlp_all == 1), i.e. active only for this language.
    specific = []
    for i in range(langs_counts):       
        indices_arr1 = np.where(mlp_all_act[i] == 1)
        indices_arr2 = np.where(mlp_all == 1)
        set_indices_arr1 = set(zip(indices_arr1[0], indices_arr1[1]))
        set_indices_arr2 = set(zip(indices_arr2[0], indices_arr2[1]))
        intersection = set_indices_arr1.intersection(set_indices_arr2)
        if len(intersection) > 0:
            rows_intersection, cols_intersection = zip(*intersection)
        else:
            rows_intersection, cols_intersection = [],[]
        # Regroup the (layer, neuron) pairs into one neuron list per layer.
        row = [[] for _ in range(LAYERS)]
        for k in range(len(rows_intersection)):
            r,c = rows_intersection[k],cols_intersection[k]
            row[r].append(c)
        specific.append(row)


    # all_specific[layer]: language-specific neurons of all languages concatenated.
    all_specific = []
    for ly in range(LAYERS):
        temp = []
        for i in range(langs_counts):
            temp += specific[i][ly]
        all_specific.append(temp)


    # partial_shared[lang][layer]: neurons active for this language that are neither
    # all-shared nor specific to it.
    partial_shared = []
    for lg in range(langs_counts):
        lg_some = []
        for ly in range(LAYERS):
            other = all_shared[ly].tolist() + specific[lg][ly]
            some = list((set(all_activated[lg][ly])-set(other)))
            lg_some.append(some)
        partial_shared.append(lg_some)

    # all_partial_shared[layer]: union of the per-language partial-shared sets.
    all_partial_shared = []
    for ly in range(LAYERS):
        temp = []
        for i in range(langs_counts):
            temp += partial_shared[i][ly]
        all_partial_shared.append(list(set(temp)))

    # Per language version: contribution score of every neuron, summarised per class and layer.
    as_lang_mam,ps_lang_mam,ls_lang_mam,non_lang_mam = [],[],[],[]
    for ind in range(len(sents)):
        sent = sents[ind]        

        request.cal_prompt(sent)
        indexs = [[i for i in range(16384)] for _ in range(LAYERS)]
        cont_value = request.all_contribution_values(indexs)['contribution_vals']
        as_values,ps_values,ls_values,non_values = [],[],[],[]
        for ly in range(LAYERS):
            cont_layer_values = np.array(cont_value[ly])
            all_shared_indices = all_shared[ly]
            non_indices = non[ly]
            specific_indices = all_specific[ly]
            partial_shared_indices = all_partial_shared[ly]
            # Scores of the neurons belonging to each class in this layer.
            all_shared_value,partial_shared_value,specific_value,non_value = cont_layer_values[all_shared_indices],cont_layer_values[partial_shared_indices],cont_layer_values[specific_indices],cont_layer_values[non_indices]            

            # Each entry is [max, mean, min, sum]; an empty class contributes zeros.
            if len(all_shared_value) > 0:
                as_values.append([np.max(all_shared_value),np.mean(all_shared_value),np.min(all_shared_value),np.sum(all_shared_value)])
            else:
                as_values.append([0,0,0,0])
            if len(partial_shared_value) > 0:
                ps_values.append([np.max(partial_shared_value),np.mean(partial_shared_value),np.min(partial_shared_value),np.sum(partial_shared_value)])
            else:
                ps_values.append([0,0,0,0])
            if len(specific_value) > 0:
                ls_values.append([np.max(specific_value),np.mean(specific_value),np.min(specific_value),np.sum(specific_value)])
            else:
                ls_values.append([0,0,0,0])
            if len(non_value) > 0:
                non_values.append([np.max(non_value),np.mean(non_value),np.min(non_value),np.sum(non_value)])
            else:
                non_values.append([0,0,0,0])
        as_lang_mam.append(as_values)
        ps_lang_mam.append(ps_values)
        ls_lang_mam.append(ls_values)
        non_lang_mam.append(non_values) # languages x layers x 4 statistics (max, mean, min, sum)
    as_mams.append(as_lang_mam)
    ps_mams.append(ps_lang_mam)
    ls_mams.append(ls_lang_mam)
    non_mams.append(non_lang_mam)


# Average over examples -> one (language, layer, statistic) array per class.
as_mams,ps_mams,ls_mams,non_mams = np.mean(as_mams,axis=0),np.mean(ps_mams,axis=0),np.mean(ls_mams,axis=0),np.mean(non_mams,axis=0)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator

# Font properties for the shared figure legends.
legend_font = {
    'style': 'normal',
    'size': 9,  # font size
    'weight': "bold",  # font weight
}


# Statistic index 1 is the mean and index 3 the sum (see the [max, mean, min, sum] entries above).
avg_as_mams,avg_ps_mams,avg_ls_mams,avg_non_mams = as_mams[:,:,1],ps_mams[:,:,1],ls_mams[:,:,1],non_mams[:,:,1]
sum_as_mams,sum_ps_mams,sum_ls_mams,sum_non_mams = as_mams[:,:,3],ps_mams[:,:,3],ls_mams[:,:,3],non_mams[:,:,3]

xLabel=[i+1 for i in range(LAYERS)]
fig = plt.figure(figsize=(10,5))
x = [i+1 for i in range(LAYERS)]  # 1-based layer numbers on the x axis

# Figure 1: mean contribution score per layer, one subplot per language (2 x 5 grid).
for i in range(1, langs_counts+1):
    plt.subplot(2, 5, i)
    plt.plot(x,np.array(avg_as_mams[i-1]),color="#59A14F",marker='s',markersize='2',label="All-shared Neuron")   
    plt.plot(x,np.array(avg_ps_mams[i-1]),color="#F28E2B",label="Partial-shared Neuron")
    plt.plot(x,np.array(avg_ls_mams[i-1]),color="#E15759",label="Specific Neuron")
    plt.plot(x,np.array(avg_non_mams[i-1]),color="#A0CBE8",label="Non-activated Neuron")
    plt.title(langs[i-1],fontsize=9,fontweight='bold')
    plt.xticks(xLabel,fontsize=6.7,fontweight='bold')
    plt.yticks(fontsize=6.7,fontweight='bold')
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(5))  # tick every 5 layers

# Shared axis captions placed in figure coordinates.
fig.text(0.5, 0.045, 'Layers', ha='center',fontsize=9,fontweight='bold')
fig.text(0.07, 0.5, 'Average Contribution Score', va='center', rotation='vertical',fontsize=9,fontweight='bold')
fig.subplots_adjust(wspace=0.3,hspace=0.25) 

# All subplots share the same four series, so the first one supplies the legend entries.
lines, labels = fig.axes[0].get_legend_handles_labels()

fig.legend(lines, labels, bbox_to_anchor=(0.5,-0.03), loc=8,prop=legend_font,ncol=4)
plt.show()


# Figure 2: same layout, showing the summed contribution score per layer.
fig = plt.figure(figsize=(10,5))
for i in range(1, langs_counts+1):
    plt.subplot(2, 5, i)
    plt.plot(x,np.array(sum_as_mams[i-1]),color="#59A14F",marker='s',markersize='2',label="All-shared Neuron")   
    plt.plot(x,np.array(sum_ps_mams[i-1]),color="#F28E2B",label="Partial-shared Neuron")
    plt.plot(x,np.array(sum_ls_mams[i-1]),color="#E15759",label="Specific Neuron")
    plt.plot(x,np.array(sum_non_mams[i-1]),color="#A0CBE8",label="Non-activated Neuron")
    plt.title(langs[i-1],fontsize=9,fontweight='bold')
    plt.xticks(xLabel,fontsize=6.7,fontweight='bold')
    plt.yticks(fontsize=6.7,fontweight='bold')
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(5))

fig.text(0.5, 0.045, 'Layers', ha='center',fontsize=9,fontweight='bold')
fig.text(0.07, 0.5, 'Sum of Contribution Score', va='center', rotation='vertical',fontsize=9,fontweight='bold')
fig.subplots_adjust(wspace=0.3,hspace=0.25) 

lines, labels = fig.axes[0].get_legend_handles_labels()

fig.legend(lines, labels, bbox_to_anchor=(0.5,-0.03), loc=8,prop=legend_font,ncol=4)
plt.show()

## Effective Score

In [None]:
import torch
import numpy as np
from numpy import *
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
import matplotlib.ticker as ticker

# NOTE: the body of this cell used to be indented under the imports, which is an
# IndentationError at module level; it now runs as ordinary top-level code.


def _max_mean_min(values):
    """Return [max, mean, min] of `values`, or [0, 0, 0] when `values` is empty."""
    if len(values) > 0:
        return [np.max(values), np.mean(values), np.min(values)]
    return [0, 0, 0]


question_all,answer_all = load_data_XNLI()

# NOTE(review): `length_filter` is not defined in the visible part of the notebook —
# verify it is defined before this cell runs.
questions,answers = length_filter(question_all,answer_all)
test_counts = len(questions)      # number of examples kept
langs_counts = len(questions[0])  # number of language versions per example

# Position in `sents` whose answer is appended without a separating space
# (index 8 is 'zh' in the notebook's `langs` list).
NO_SPACE_INDEX = 8

ratios = []
# Per-example statistics for the four neuron classes
# (all-shared, partial-shared, language-specific, non-activated).
as_mams,ps_mams,ls_mams,non_mams = [],[],[],[]
# The class defined in this notebook is `ModelingRequests`.
request = ModelingRequests()

for q in range(test_counts):
    print(q)
    sents,anss = questions[q],answers[q]

    # Traced MLP activations for each language version of this example.
    mlp_acts = []
    for sent in sents:
        input_ids = tokenizer(sent, return_tensors='pt')['input_ids'].to('cuda')
        if 'bloom' in MODEL:
            mlp_act = act_bloom(input_ids)
        mlp_acts.append(mlp_act)

    # Binarise (1 where activation > 0) and count, per (layer, neuron), in how many
    # languages the neuron fired.
    mlp_all_act = [(mlp_acts[i] > 0).astype(int) for i in range(langs_counts)]
    mlp_all = np.sum(mlp_all_act,axis=0)

    # Neuron classes per layer, expressed through the language count:
    #   all-shared     : fired in every language
    #   non-activated  : fired in no language
    #   specific       : fired in exactly one language (union over languages of
    #                    "active here and count == 1")
    #   partial-shared : fired in more than one but not all languages (union over
    #                    languages of "active here, not all-shared, not specific here")
    all_shared = [np.flatnonzero(mlp_all[ly] == langs_counts) for ly in range(LAYERS)]
    non = [np.flatnonzero(mlp_all[ly] == 0) for ly in range(LAYERS)]
    all_specific = [np.flatnonzero(mlp_all[ly] == 1) for ly in range(LAYERS)]
    all_partial_shared = [
        np.flatnonzero((mlp_all[ly] > 1) & (mlp_all[ly] < langs_counts))
        for ly in range(LAYERS)
    ]

    as_lang_mam,ps_lang_mam,ls_lang_mam,non_lang_mam = [],[],[],[]
    for ind in range(len(sents)):
        sent = sents[ind]
        ans = anss[ind]

        # Reference token: the first token the answer adds after the prompt's tokens.
        q_encodings = tokenizer(sent, return_tensors='pt')['input_ids']
        if ind != NO_SPACE_INDEX:
            a_encodings = tokenizer(sent + ' ' + ans, return_tensors='pt')['input_ids']
        else:
            a_encodings = tokenizer(sent + ans, return_tensors='pt')['input_ids']
        reference_id = a_encodings[0][len(q_encodings[0])]

        request.cal_prompt(sent)
        # effective_score(token_id, neuron_idxs, layer_idxs): score all layers.
        effective_score_value = request.effective_score(reference_id, None, list(range(LAYERS)))

        as_values,ps_values,ls_values,non_values = [],[],[],[]
        for ly in range(LAYERS):
            effective_score_layer_values = np.array(effective_score_value[ly])
            as_values.append(_max_mean_min(effective_score_layer_values[all_shared[ly]]))
            ps_values.append(_max_mean_min(effective_score_layer_values[all_partial_shared[ly]]))
            ls_values.append(_max_mean_min(effective_score_layer_values[all_specific[ly]]))
            non_values.append(_max_mean_min(effective_score_layer_values[non[ly]]))
        as_lang_mam.append(as_values)
        ps_lang_mam.append(ps_values)
        ls_lang_mam.append(ls_values)
        non_lang_mam.append(non_values)  # languages x layers x 3 statistics (max, mean, min)
    as_mams.append(as_lang_mam)
    ps_mams.append(ps_lang_mam)
    ls_mams.append(ls_lang_mam)
    non_mams.append(non_lang_mam)

# Average over examples -> one (language, layer, statistic) array per class.
as_mams,ps_mams,ls_mams,non_mams = np.mean(as_mams,axis=0),np.mean(ps_mams,axis=0),np.mean(ls_mams,axis=0),np.mean(non_mams,axis=0)

legend_font = {
    'style': 'normal',
    'size': 9,  # font size
    'weight': "bold",  # font weight
}

# Statistic index 0 is the per-layer maximum.
max_as_mams,max_ps_mams,max_ls_mams,max_non_mams = as_mams[:,:,0],ps_mams[:,:,0],ls_mams[:,:,0],non_mams[:,:,0]
maxs = [max_as_mams,max_ps_mams,max_ls_mams,max_non_mams]
panel_titles = ['All-shared Neuron', 'Partial-shared Neuron', 'Specific Neuron', 'Non-activated Neuron']

fig = plt.figure(figsize=(4,4))
x = [i+1 for i in range(LAYERS)]  # 1-based layer numbers on the x axis
axes = fig.subplots(nrows=2, ncols=2)

# One panel per neuron class (row-major order); one marker series per language.
for tp, ax in enumerate(axes.flat):
    for i in range(langs_counts):
        ax.scatter(x, maxs[tp][i], s=20, c=colors[i+2], marker='+',label=langs[i])
    ax.tick_params(axis='both',labelsize=6)
    ax.set_title(panel_titles[tp],fontsize=7,fontweight='bold')
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))  # tick every 5 layers

# Shared axis captions placed in figure coordinates.
fig.text(0.5, 0.02, 'Layers', ha='center',fontsize=8,fontweight='bold')
fig.text(0.01, 0.5, 'Effective Score of Neuron', va='center', rotation='vertical',fontsize=8,fontweight='bold')
fig.subplots_adjust(wspace=0.3,hspace=0.3)

# Every panel has the same language series, so any one of them supplies the legend.
lines, labels = fig.axes[-1].get_legend_handles_labels()
fig.legend(lines, labels, bbox_to_anchor=(0, 0), borderaxespad=0, loc=2, ncol=5,prop=legend_font)
plt.show()

print("____________________________________________")
print(as_mams,ps_mams,ls_mams,non_mams)

                
                