In [None]:
import torch
# Report CUDA availability and pick the device used by the rest of the notebook.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print("Num GPUs:", torch.cuda.device_count())
print("GPU Name:", torch.cuda.get_device_name(0) if cuda_available else "No GPU")

device = torch.device('cuda' if cuda_available else 'cpu')
if cuda_available:
    # The torch.cuda.* helpers only accept CUDA devices, so they are guarded;
    # index 0 is the same GPU whose name is reported above.
    torch.cuda.set_device(0)
    # torch.cuda.empty_cache()
    print(f'Using device: {device}, {torch.cuda.get_device_name(device)}')
else:
    print(f'Using device: {device}')


In [None]:
import spacy
import numpy
print(numpy.__version__)

import cupy

# Enable GPU *before* loading the pipeline: spaCy allocates the model's
# weights on whichever device is active at load time, so activating the GPU
# afterwards would leave this pipeline on CPU.
spacy.require_gpu()

# Load the model
nlp = spacy.load("en_core_web_sm")

# Smoke test: run one document through the pipeline
doc = nlp("This is a test document.")
print(doc[0].vector)  # Accessing the vector of the first token to make sure all works fine.


In [None]:
def clean_text(text):
    """Return ``text`` with every non-ASCII character removed.

    The string is encoded to ASCII with undecodable characters discarded,
    then decoded back to ``str``.
    """
    ascii_only = text.encode('ascii', errors='ignore')
    return ascii_only.decode('ascii')




def process_dataframe(df):
    """Normalise labels to 0/1, drop unlabelled rows and ASCII-clean messages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``manually_label`` and ``clean_message``.
        Recognised label spellings are ``'positive'``/``'negative'``,
        ``'1'``/``'0'``, ``'1.0'``/``'0.0'`` and the numeric values 1 / 0.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding all positive rows
        followed by all negative rows, with a fresh 0..n-1 index, an integer
        ``manually_label`` column and ``clean_message`` passed through
        ``clean_text``. Rows whose label is not recognised are dropped and
        their count is printed.
    """
    label_lookup = {
        'positive': 1, 'negative': 0,
        '1.0': 1, '0.0': 0,
        '1': 1, '0': 0,
    }

    def _to_binary(value):
        # Map one raw label to 1, 0 or None (unrecognised).
        if isinstance(value, str):
            return label_lookup.get(value)
        try:
            if value == 1:
                return 1
            if value == 0:
                return 0
        except (TypeError, ValueError):
            # Values whose comparison is ambiguous (e.g. pandas.NA) count as unlabelled.
            pass
        return None

    # float64 so that unrecognised labels become NaN and compare False below.
    binary = pd.Series(
        [_to_binary(value) for value in df['manually_label']],
        dtype='float64',
    ).to_numpy()

    # Positional boolean masks keep this working even with a non-unique index.
    df_pos = df.loc[binary == 1]
    df_neg = df.loc[binary == 0]
    print("pos:{}, neg:{}".format(df_pos.shape, df_neg.shape))

    dropped = len(df) - len(df_pos) - len(df_neg)
    if dropped:
        print("dropped:{} rows with unrecognised labels".format(dropped))

    result = pd.concat([df_pos, df_neg], ignore_index=True)
    # Store the normalised labels as real integers rather than a str/float mix.
    result['manually_label'] = pd.Series(
        [1] * len(df_pos) + [0] * len(df_neg),
        index=result.index,
        dtype='int64',
    )
    result['clean_message'] = result['clean_message'].astype(str).apply(clean_text)

    return result

In [None]:
import pandas as pd
import spacy
import networkx as nx
from spacy.tokens import Token
from tqdm.auto import tqdm

# Load the SpaCy model
# This (re)binds the module-level `nlp` that build_dependency_graph below
# reads, so this cell does not rely on an `nlp` object created by another cell.
nlp = spacy.load("en_core_web_sm")

def build_dependency_graph(commit_message):
    """Turn a commit message into a directed token graph.

    The message is parsed with the module-level ``nlp`` pipeline.

    Nodes are keyed by the token's index in the parsed document (``token.i``)
    and carry ``label`` (the token text) and ``feature`` (``token.vector``).

    Edges:
      * ``head -> token`` for every token that is not its own head, with
        ``label`` set to the dependency relation (``token.dep_``);
      * ``last token of sentence k -> first token of sentence k+1`` with
        ``label='neigh'``, chaining consecutive sentences together.

    Returns the resulting ``networkx.DiGraph``.
    """
    doc = nlp(commit_message)
    graph = nx.DiGraph()

    # Register every node up front, in token order. add_edge() implicitly
    # creates missing nodes, so adding edges first would insert a head that
    # appears later in the sentence ahead of its turn and scramble the
    # node iteration order.
    for token in doc:
        graph.add_node(token.i, label=token.text, feature=token.vector)

    last_token_of_previous_sentence = None
    for sentence in doc.sents:
        for position, token in enumerate(sentence):
            if token.head != token:
                graph.add_edge(token.head.i, token.i, label=token.dep_)

            # Only the first token of a sentence is linked to the previous
            # sentence; no link is added inside the very first sentence.
            if position == 0 and last_token_of_previous_sentence is not None:
                graph.add_edge(last_token_of_previous_sentence.i, token.i, label='neigh')

            last_token_of_previous_sentence = token

    return graph

# Read the training CSV and normalise labels / messages in one step
df = process_dataframe(pd.read_csv('./train.csv'))

# Register tqdm's progress_apply on pandas objects, then build one graph per message
tqdm.pandas(desc="Building graphs")
commit_messages = df['clean_message']
df['graph'] = commit_messages.progress_apply(build_dependency_graph)


In [None]:
import torch
from torch_geometric.nn import GatedGraphConv, global_max_pool
from torch_geometric.loader import DataLoader  # Instead of from torch_geometric.data import DataLoader
from torch_geometric.data import Data
import numpy as np

class GGNNModel(torch.nn.Module):
    """Graph classifier: gated graph convolution, max pooling, linear head."""

    def __init__(self, node_feature_dim, num_classes):
        """Create a 3-layer ``GatedGraphConv`` and a linear output layer.

        ``node_feature_dim`` is used both as the output width of the gated
        graph convolution and as the input width of the linear layer, which
        produces ``num_classes`` values per graph.
        """
        super().__init__()
        self.ggnn = GatedGraphConv(node_feature_dim, num_layers=3)
        self.fc = torch.nn.Linear(node_feature_dim, num_classes)

    def forward(self, data):
        """Return the linear-layer output for each graph in ``data``.

        ``data`` must expose ``x``, ``edge_index`` and ``batch``.
        """
        node_states = self.ggnn(data.x, data.edge_index)
        # Collapse node states into one vector per graph (max over its nodes).
        graph_embedding = global_max_pool(node_states, data.batch)
        return self.fc(graph_embedding)

# Training setup. Node features are expected to be attached to the graphs already.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = GGNNModel(node_feature_dim=128, num_classes=2)  # Example dimensions
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)



In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import numpy as np
import cupy  # Make sure to import cupy

def convert_to_torch_geometric(graph, label):
    """Convert a networkx token graph into a PyTorch Geometric ``Data`` object.

    Parameters
    ----------
    graph : networkx graph or None
        Every node must carry a ``feature`` attribute that is a CuPy array,
        a NumPy array or a list.
    label : int-like
        Class label, stored as ``data.y`` (a 1-element long tensor).

    Returns
    -------
    Data or None
        ``None`` (after printing the reason) when the graph is missing, has
        no nodes or no edges, a node has no usable feature, or the tensors
        cannot be built.

    Notes
    -----
    Row ``i`` of ``data.x`` holds the feature of the ``i``-th node in the
    graph's iteration order. Edge endpoints are translated from node ids to
    those row positions, so ``edge_index`` always refers to the correct
    feature rows even if node ids are not ``0..n-1`` in iteration order.
    """
    if graph is None or len(graph.nodes()) == 0 or len(graph.edges()) == 0:
        print("Empty graph or missing data.")
        return None

    node_features = []
    node_row = {}  # node id -> row index in the feature matrix

    # Collect node features, remembering which row each node ends up in
    for node_id, attributes in graph.nodes(data=True):
        feature = attributes.get('feature')
        if feature is None:
            print(f"Missing feature for node {node_id}")
            return None
        if isinstance(feature, cupy.ndarray):
            feature = cupy.asnumpy(feature)  # Convert CuPy array to NumPy array
        if isinstance(feature, np.ndarray):
            feature = feature.tolist()  # Convert NumPy array to list
        elif not isinstance(feature, list):
            print(f"Invalid feature type for node {node_id}: {type(feature)}")
            return None
        node_row[node_id] = len(node_features)
        node_features.append(feature)

    # Re-express every edge in terms of feature-matrix rows
    edge_indices = [[node_row[source], node_row[target]] for source, target in graph.edges()]

    if not node_features or not edge_indices:
        print("No features or edges found for graph conversion.")
        return None

    try:
        # Convert to tensors
        x = torch.tensor(node_features, dtype=torch.float32)
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()

        # Create PyTorch Geometric data object
        data = Data(x=x, edge_index=edge_index)
        data.y = torch.tensor([label], dtype=torch.long)
        return data
    except Exception as e:
        # Best-effort conversion: report the graph as unusable and let the
        # caller filter it out rather than aborting the whole run.
        print(f"Error converting graph: {e}")
        return None

# Smoke-test the conversion on the first row of the training frame
if df.empty:
    print("DataFrame is empty, check data loading.")
else:
    first_row = df.iloc[0]
    sample_graph = build_dependency_graph(first_row['clean_message'])
    sample_label = first_row['manually_label']
    sample_data = convert_to_torch_geometric(sample_graph, sample_label)
    print(f"Converted sample data: {sample_data}")


In [None]:
# Convert every available graph, keeping only the conversions that succeeded
data_list = [
    converted
    for converted in (
        convert_to_torch_geometric(graph, target)
        for graph, target in zip(df['graph'], df['manually_label'])
        if graph is not None
    )
    if converted is not None
]
print(f"Total graphs converted and not None: {len(data_list)}")


In [None]:
# Wrap the converted graphs in a shuffling mini-batch loader (skipped when there is nothing to load)
if not data_list:
    print("No valid graph data available for DataLoader.")
else:
    loader = DataLoader(data_list, shuffle=True, batch_size=32)
    print(f"Data ready for training with {len(data_list)} items.")


In [None]:
# Train the model, reporting the per-graph average loss after every epoch.
if data_list:
    # Same progress-bar implementation as the rest of the notebook
    from tqdm.auto import tqdm

    num_epochs = 10
    model.train()  # Make sure the model is in training mode, whatever ran before

    try:
        for epoch in range(num_epochs):
            epoch_loss = 0
            total = 0
            for data in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True):
                data = data.to(device)  # Move the batch to the training device
                optimizer.zero_grad()  # Clear gradients left over from the previous step
                out = model(data)
                loss = criterion(out, data.y)
                loss.backward()
                optimizer.step()
                # The loss is a batch mean; weight it by batch size so the
                # epoch figure is a true per-graph average.
                epoch_loss += loss.item() * data.num_graphs
                total += data.num_graphs

            epoch_loss /= total
            print(f'Average Loss for Epoch {epoch+1}: {epoch_loss:.4f}')
    except Exception as e:
        print(f"Training error occurred: {e}")
        # Re-raise so that later cells do not silently save or evaluate a
        # model whose training was interrupted.
        raise
else:
    print("Training aborted due to no data.")  # Message if there is no data to train on


In [None]:
# Save the model
# Only the parameters (state_dict) are written, not the module object itself,
# so loading requires constructing a GGNNModel first.
torch.save(model.state_dict(), './espi_ggnn_model_(js)_(12_07_24).pth')


## Load and test the model

In [None]:
# Load the model: rebuild the architecture, then restore the saved parameters.
# NOTE(review): this file name differs from the one written by the save cell
# ('./espi_ggnn_model_(js)_(12_07_24).pth') - confirm which checkpoint is intended.
model = GGNNModel(node_feature_dim=128, num_classes=2).to(device)
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load('./espi_ggnn_model(12_07_24).pth', map_location=device))
model.eval()  # Set the model to evaluation mode


In [None]:
import pandas as pd
import torch
from torch_geometric.loader import DataLoader
from tqdm.auto import tqdm

def load_and_prepare_test_data(file_path):
    """Read a test CSV and return a ``DataLoader`` of graph samples.

    The file is read as UTF-8, passed through ``process_dataframe``, each
    ``clean_message`` is turned into a graph with ``build_dependency_graph``
    and every graph is converted with ``convert_to_torch_geometric``.
    Failed conversions are discarded.

    Returns a non-shuffling loader with batch size 32, or ``None`` (after
    printing a message) when no sample could be converted.
    """
    raw_frame = pd.read_csv(file_path, encoding='utf-8')
    test_df = process_dataframe(raw_frame)

    # Build one dependency graph per message, with a progress bar
    tqdm.pandas(desc="Building test graphs")
    test_df['graph'] = test_df['clean_message'].progress_apply(build_dependency_graph)

    # networkx graphs -> PyTorch Geometric samples, skipping failed conversions
    test_data_list = []
    for graph, label in zip(test_df['graph'], test_df['manually_label']):
        sample = convert_to_torch_geometric(graph, label)
        if sample is not None:
            test_data_list.append(sample)

    if len(test_data_list) == 0:
        print("No valid graph data available for DataLoader in test set.")
        return None

    test_loader = DataLoader(test_data_list, shuffle=False, batch_size=32)
    print(f"Data ready for testing with {len(test_data_list)} items.")
    return test_loader

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch_geometric.loader import DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, precision_score,classification_report, recall_score, f1_score, confusion_matrix,precision_recall_curve

def evaluate_model_on_test_data(model, test_loader):
    """Run ``model`` over ``test_loader`` and summarise its predictions.

    Prints scikit-learn's classification report and returns the confusion
    matrix as a 2x2 ``pandas.DataFrame`` whose rows and columns are both
    labelled ``negative`` (class 0) and ``positive`` (class 1).

    Returns ``None`` (after printing a message) when ``test_loader`` is None.
    """
    if test_loader is None:
        print("Testing aborted due to no data.")
        return None

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for data in tqdm(test_loader, desc="Testing"):
            data = data.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs, dim=1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(data.y.cpu().numpy())

    # Classification report
    report = classification_report(all_labels, all_predictions)
    print("Classification Report:\n", report)

    # Pin the class order so the matrix is always 2x2, even when one class
    # never occurs in the labels or predictions; otherwise the fixed
    # negative/positive axis labels below would not fit.
    class_names = ['negative', 'positive']
    matrix = confusion_matrix(all_labels, all_predictions, labels=[0, 1])
    comfmat = pd.DataFrame(matrix, index=class_names, columns=class_names)
    return comfmat



In [None]:
# Build the test DataLoader from the test CSV, e.g.:
# test_loader = load_and_prepare_test_data('./test.csv')
# NOTE(review): the path below starts with '/', i.e. it is resolved from the
# filesystem root, whereas the training data is read from './train.csv' -
# confirm this location is intended.

test_loader = load_and_prepare_test_data('/20_js_test.csv')



In [None]:
# Evaluate the loaded model on the test loader; this prints the classification
# report and yields the confusion matrix (None when test_loader is None).
comfmat = evaluate_model_on_test_data(model, test_loader)


In [None]:
# Display the confusion-matrix DataFrame as the cell output
comfmat