In [5]:
from openai import OpenAI
import openai
import json
import ollama
import pandas as pd
tools = [
    {
        "type": "function",
        "function": {
            "name": "assign_subject",
            "description": "Assign subject to each article title",
            "parameters": {
                "type": "object",
                "properties": {
                    "subject_1": {
                        "type": "string",
                        "description": "The first closest subject of article"
              },
                    "subject_2": {
                        "type": "string",
                        "description": "The second closest subject of article"
              },
                    "subject_3": {
                        "type": "string",
                        "description": "The third closest subject of article"
              },
                },
                "required": [],
            }
        }
    }
]

# client = OpenAI(
#     base_url = 'http://localhost:11434',
#     api_key='ollama', # required, but unused
# )


candidate_labels = ','.join([
    "Agricultural and Biological Sciences", "Arts and Humanities", "Biochemistry, Genetics and Molecular Biology",
    "Business, Management and Accounting", "Chemical Engineering", "Chemistry", "Computer Science",
    "Decision Sciences", "Dentistry", "Earth and Planetary Sciences", "Economics, Econometrics and Finance",
    "Energy", "Engineering", "Environmental Science", "Health Professions", "Immunology and Microbiology",
    "Materials Science", "Mathematics", "Medicine", "Neuroscience", "Nursing",
    "Pharmacology, Toxicology and Pharmaceutics", "Physics and Astronomy", "Psychology", "Social Sciences",
    "Veterinary"
])

title = "Timing of surgery following SARSâ€CoVâ€2 infection: an international prospective cohort study"


response = ollama.chat(
    model="mmlTOP",# changed temperature = 0.2 and TOP_P = 0.1 of llama3.1:8b to reduce randomness using modelFile
    messages=[
        {"role": "user", "content": f"Assign three closest subjcets from this subjects'{candidate_labels}' according to article title: '{title}'"}
    ],
    tools=tools,
)


### Test llama model on one article's title

In [6]:
response["message"]["tool_calls"][0]["function"]["arguments"].values()

dict_values(['Medicine', 'Health Professions', 'Immunology and Microbiology'])

# classify all the titles using llama 3.1:8b

## and saving each 10'000 batch just in case

In [3]:
import concurrent.futures
import time

#GOLEN CODE
#Have I ever told you the definition of INSANITY?????????????????????

# Function to process each row
def classify_article(row, index):
    title = row['title']
    print(f"Started processing index {index} at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")
    start_time = time.time()
    response = ollama.chat(
        model="mmlTOP",
        messages=[
            {"role": "user", "content": f"Assign three closest subjcets from this subjects'{candidate_labels}' according to article title: '{title}'"}
        ],
        tools=tools
    )
    try:
        llama_classification = list(response["message"]["tool_calls"][0]["function"]["arguments"].values())
        llama_classification.append(index) #Add index to make sure the code won't mess up the indexing
    except KeyError:
        print(f"Error processing index {index}: KeyError")
        llama_classification = [index] #Add index to make sure the code won't mess up the indexing
    end_time = time.time()
    print(f"Finished processing index {index} at {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())} (Duration: {end_time - start_time:.2f} seconds)")
    return llama_classification

# OG DataFrame
articles = pd.read_csv('articlesV2.csv')

start_index = 714
counter = 1
# counter <=35
while counter <=35:
    last_index = start_index + 10000
    articles_subset2 = articles.iloc[start_index:last_index]
    
    # Use ThreadPoolExecutor for multithreading
    classifications = []
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
        futures = {executor.submit(classify_article, row, index): index for index, row in articles_subset2.iterrows()}
        for future in concurrent.futures.as_completed(futures):
            index = futures[future]
            try:
                classifications.append(future.result())
            except Exception as e:
                print(f"Error processing index {index}: {e}")
    
    total_duration = time.time() - start_time
    print(f"Total processing time for 10000 rows: {total_duration:.2f} seconds")
    classifications.sort(key=lambda tup: tup[-1])  # sort the result cause ChatGPT multi thread codes causes chaos in indexes
    articles_subset2['llama_classified'] = classifications
    articles_subset2.to_csv(f'articles_Batch{counter}.csv')
    counter += 1
    start_index = last_index

Started processing index 0 at 2024-08-09 11:46:16
Started processing index 1 at 2024-08-09 11:46:16
Started processing index 2 at 2024-08-09 11:46:16
Started processing index 3 at 2024-08-09 11:46:16
Started processing index 4 at 2024-08-09 11:46:16
Started processing index 5 at 2024-08-09 11:46:16
Started processing index 6 at 2024-08-09 11:46:16
Started processing index 7 at 2024-08-09 11:46:16
Started processing index 8 at 2024-08-09 11:46:16
Started processing index 9 at 2024-08-09 11:46:16
Started processing index 10 at 2024-08-09 11:46:16
Started processing index 11 at 2024-08-09 11:46:16
Finished processing index 3 at 2024-08-09 11:46:17 (Duration: 1.18 seconds)
Started processing index 12 at 2024-08-09 11:46:17
Finished processing index 1 at 2024-08-09 11:46:17 (Duration: 1.21 seconds)
Started processing index 13 at 2024-08-09 11:46:17
Finished processing index 2 at 2024-08-09 11:46:17 (Duration: 1.25 seconds)
Started processing index 14 at 2024-08-09 11:46:17
Finished processi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_subset2['llama_classified'] = classifications


In [8]:
#while counter <= 35:
concated_articles = pd.read_csv(f'articles_Batch1.csv')
concated_articles2 = pd.read_csv(f'articles_Batch2.csv')
concat_them = pd.concat([concated_articles, concated_articles2])

In [9]:
concat_them

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,GS_link,year,cite,main_authors,more_info,link_ids,classified_categories,llama_classified
0,714,714,Effects of waste-derived ethylene glycol diace...,/citations?view_op=view_citation&hl=en&user=-F...,2020,39,"S Amid, M Aghbashlo, M Tabatabaei, A Hajiahmad...","Energy conversion and management 203, 112245, ...","6, 1213, 1293, 21789, 139","[('Energy', 0.2021443247795105), ('Chemistry',...","['Engineering', 'Chemical Engineering', 'Energ..."
1,715,715,Enhanced power generation and desalination rat...,/citations?view_op=view_citation&hl=en&user=-F...,2020,36,"T Jafary, A Al-Mamun, H Alhimali, MS Baawain, ...","Renewable and Sustainable Energy Reviews 127, ...",6,"[('Energy', 0.2837088406085968), ('Engineering...","['Environmental Science', 'Engineering', 'Chem..."
2,716,716,Exergetic sustainability analysis of municipal...,/citations?view_op=view_citation&hl=en&user=-F...,2022,35,"S Soltanian, SA Kalogirou, M Ranjbari, H Amiri...","Renewable and Sustainable Energy Reviews 156, ...","6, 17766","[('Energy', 0.32668590545654297), ('Environmen...","['Environmental Science', 'Engineering', 'Ener..."
3,717,717,A state-of-the-art review on producing enginee...,/citations?view_op=view_citation&hl=en&user=-F...,2022,35,"WAW Mahari, K Waiho, E Azwar, H Fazhan, W Peng...","Chemosphere 288, 132559, 2022",6,"[('Environmental Science', 0.15326537191867828...","['Environmental Science', 'Chemistry', 'Engine..."
4,718,718,Energy flow modeling and life cycle assessment...,/citations?view_op=view_citation&hl=en&user=-F...,2020,35,"M Khanali, D Kokei, M Aghbashlo, FK Nasab, ...","Journal of Cleaner Production 246, 118997, 2020","6, 139, 221","[('Energy', 0.7923570871353149), ('Environment...","['Environmental Science', 'Engineering', 'Ener..."
...,...,...,...,...,...,...,...,...,...,...,...
9995,20709,20709,The mediating role of working memory and mathe...,/citations?view_op=view_citation&hl=en&user=nl...,2021,0,"NZ Mahdavi, P Kadivar, A ARJMANDNIA, K Pousheneh","JOURNAL OF PSYCHOLOGICAL SCIENCE 20 (98), 269-...",811,"[('Mathematics', 0.2101140320301056), ('Neuros...","['Computer Science', 'Mathematics', 'Decision ..."
9996,20710,20710,Prediction of blood glucose level in patients ...,/citations?view_op=view_citation&hl=en&user=nl...,2020,0,"J Fathabadi, M Haji Ghorbani Doulabi, AA Arjma...","Daneshvar Medicine 28 (2), 40-49, 2020",15355811,"[('Medicine', 0.17488493025302887), ('Decision...","['Medicine', 'Immunology and Microbiology', 'P..."
9997,20711,20711,Prediction of blood glucose level in patients ...,/citations?view_op=view_citation&hl=en&user=nl...,2020,0,"J Fathabadi, M Haji Ghorbani Doulabi, AA Arjma...","Daneshvar Medicine 28 (2), 40-49, 2020",15355811,"[('Medicine', 0.17488493025302887), ('Decision...","['Medicine', 'Pharmacology, Toxicology and Pha..."
9998,20712,20712,FRnet-DTI: Convolutional neural networks for d...,/citations?view_op=view_citation&hl=en&user=PT...,2020,40,"F Rayhan, S Ahmed, Z Mousavian, DM Farid, S Sh...","Heliyon, 2020",812,"[('Neuroscience', 0.46937939524650574), ('Comp...","['Computer Science', 'Engineering', 'Decision ..."


# Cell below is to concat all the csv files

In [3]:
import pandas as pd
import os

# Define the directory where your CSV files are located
csv_dir = ''

# Initialize an empty list to hold DataFrames
df_list = []

# Loop through each file and read them into a DataFrame
for i in range(1, 36):
    file_name = f'articles_Batch{i}.csv'  # Assuming files are named as file1.csv, file2.csv, ..., file35.csv
    file_path = os.path.join(csv_dir, file_name)
    
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
big_df = pd.concat(df_list, ignore_index=True)

# Save the big DataFrame to a new CSV file
output_file = os.path.join(csv_dir, 'combined_file.csv')
big_df.to_csv(output_file, index=False)

print(f'All files have been concatenated and saved to {output_file}')


All files have been concatenated and saved to combined_file.csv


In [9]:
#first714 = pd.read_csv("articles_first714.csv")
#ultimate_df = pd.concat([first714, big_df])

In [18]:
import pandas as pd
ultimate_df = pd.read_csv("llama_classified_articles.csv")

In [19]:
#ultimate_df.to_csv('llama_classified_articles.csv')

# simple validation just to make sure index in llama_classified is equal to index of the title

In [20]:
counter = 0
for index, row in ultimate_df.iterrows():
    
    if counter <= 350713:
        row_list =list(row['llama_classified'].strip('[]').split(','))[-1]
        #print(row['llama_classified'][-2], index)
        #print(int(row_list) != index)
        if int(row_list) != index:
            print(row_list,index)
    counter += 1