# Process issues for classification w.4

In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.1 MB/s eta 0:00:011
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 11.4 MB/s eta 0:00:01
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 136.6 MB/s eta 0:00:01
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 110.8 MB/s eta 0:00:01
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)
[K     |████████████████████████████████| 330 kB 145.9 MB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.13.1-py3-none-any.

In [8]:
import sys
import os
import pandas as pd
import numpy as np
import torch
from typing import Dict

In [21]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

In [3]:
# Project key -> base name of that project's CSV file (without extension).
Issues = {f'P{idx}': f'P{idx}-g4-c' for idx in range(1, 7)}

# Dataset name -> mapping of its projects.
datasets = dict(Issues=Issues)

# Folder holding the CSV files.
# NOTE: the data is not published due to confidentiality restrictions.
datasets_path = "../Data"

## Feature extraction

In [24]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [147]:
Embedder

class Embedder:
    """Embed a text document with a Hugging Face encoder model.

    The document is tokenized into one or more chunks of at most ``max_tokens``
    tokens, every chunk is encoded, and the per-chunk vectors are reduced to a
    single vector with an element-wise maximum.
    """

    def __init__(self, model_id, max_tokens=512):
        """Load the tokenizer and model for ``model_id``.

        Args:
            model_id: Identifier handed to ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
            max_tokens: Value passed to the tokenizer as ``max_length``.
        """
        # Must be stored: embed() reads self.max_tokens.
        self.max_tokens = max_tokens
        # Module-level `device`; kept so inputs land where the model lives.
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModel.from_pretrained(model_id).to(self.device)

    def _pooling(self, outputs: torch.Tensor, inputs: Dict,  strategy: str = 'cls') -> np.ndarray:
        """Reduce per-token hidden states to one vector per row.

        Args:
            outputs: Hidden states indexed as (row, token, feature).
            inputs: Model inputs; ``inputs["attention_mask"]`` is read for
                ``'mean'`` pooling.
            strategy: ``'cls'`` takes the first token's vector, ``'mean'``
                averages the vectors of the tokens the attention mask keeps.

        Returns:
            A NumPy array with one pooled vector per row.

        Raises:
            NotImplementedError: If ``strategy`` is neither ``'cls'`` nor ``'mean'``.
        """
        if strategy == 'cls':
            pooled = outputs[:, 0]
        elif strategy == 'mean':
            mask = inputs["attention_mask"][:, :, None].to(outputs.dtype)
            # Divide each row by ITS OWN number of kept tokens (not the batch
            # total); clamp so an all-padding row cannot divide by zero.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (outputs * mask).sum(dim=1) / token_counts
        else:
            raise NotImplementedError(f"Unknown pooling strategy: {strategy!r}")
        return pooled.detach().cpu().numpy()

    def embed(self, doc):
        """Return a single embedding vector for ``doc``.

        Args:
            doc: Text handed to the tokenizer.

        Returns:
            1-D NumPy array: the element-wise maximum over the CLS-pooled
            vectors of all rows the tokenizer produced for ``doc``.
        """
        # return_overflowing_tokens + stride: text beyond max_length is returned
        # as extra rows that overlap the previous one (see tokenizer docs).
        encoded = self.tokenizer(doc, max_length=self.max_tokens, padding=True, truncation=True,
                                 stride=64, return_overflowing_tokens=True,
                                 return_tensors='pt')
        # The overflow mapping is bookkeeping, not a model input. Move the rest
        # to the model's device rather than assuming CUDA is present.
        model_inputs = {
            key: value.to(self.device)
            for key, value in encoded.items()
            if key != 'overflow_to_sample_mapping'
        }
        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            hidden_states = self.model(**model_inputs).last_hidden_state
        chunk_embeddings = self._pooling(hidden_states, model_inputs, 'cls')
        return chunk_embeddings.max(axis=0)

In [148]:
# Checkpoint to load. Alternative (currently disabled):
#   'sentence-transformers/all-MiniLM-L6-v2'
max_tokens = 512
model_id = 'mixedbread-ai/mxbai-embed-large-v1'

# Shared embedder instance used by the feature-extraction loop.
embedder = Embedder(model_id=model_id, max_tokens=max_tokens)

In [None]:
# For every project CSV: embed title and description, attach the label/metadata
# columns and one-hot encoded issue type, and write "<repo>-emb.csv" next to the
# input file.

# When truthy, the "contents" column leaves out users, title and description.
anonym = False

for dataset_name, dataset in datasets.items():
    for project, repo in dataset.items():
        print(f"{project}")

        data_path = f"{datasets_path}/{repo}.csv"
        print(data_path)

        data = pd.read_csv(data_path, sep='$')

        # Missing text becomes '' so the tokenizer always receives a string
        # (a NaN title would otherwise reach it as a float).
        titles = data["title"].fillna('')
        descriptions = data["description"].fillna('')

        emb_title = np.array([embedder.embed(x).reshape(-1) for x in titles])
        emb_desc = np.array([embedder.embed(x).reshape(-1) for x in descriptions])
        # Title features first, description features second, side by side per row.
        features = np.concatenate((emb_title, emb_desc), axis=1)

        df = pd.DataFrame(features).add_prefix("emb")
        for column in ("id", "class_name", "class_value", "type_value", "creator", "assignee"):
            df[column] = data[column]

        # "<id>:<type_name>:<effort>h" is common to both variants.
        contents = data["id"].astype(str) + ":" + data["type_name"] \
            + ":" + data["effort"].astype(str) + "h"
        if not anonym:
            contents = contents + " " + data["users"] + ":" \
                + " - " + titles + " - " + descriptions
        df["contents"] = contents

        # Replace the raw type_value column by its one-hot encoding.
        dummies = pd.get_dummies(df['type_value'], prefix="type")
        df = pd.concat([df, dummies], axis=1).drop(columns=['type_value'])

        # Write next to the input data instead of a second hard-coded folder.
        out_data_path = f"{datasets_path}/{repo}-emb.csv"
        print(out_data_path)
        df.to_csv(out_data_path, sep='$', index=False)
   