In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path
from ollama import chat
from ollama import ChatResponse
import json

# Model tags passed to ollama's `chat`. Later cells treat 'qwen3:8b' as the
# first scoring step and 'qwen3:30b' as the second step.
models = ['qwen3:8b','qwen3:30b']

In [2]:
# Build the Google Scholar work list: one row per (article, model) pair with
# the input text file, the file the model answer is written to, and metadata.
prefix = 'google_scholar/'
path = prefix + 'data'

# The catalogue is the same for every model, so it is read once here instead
# of once per loop iteration.
df_google_catalogue = pd.read_csv(
    path + '/google_scholar_search_base_article_detail_txt.csv', sep=';'
)

lst_dfs = []
for model in models:
    # ':' is replaced so the model tag can be used as a folder name.
    model_name = model.replace(':', '_')

    # Each model gets its own copy so the shared catalogue is never mutated.
    df_searchterms = df_google_catalogue.copy()

    df_searchterms['output_folder'] = path + '/score/' + model_name + '/' + df_searchterms['folder']
    # regex=False: '.html' / '.pdf' are literal extensions. Without it the
    # result depends on the pandas version (older releases treat '.' as a
    # regex wildcard).
    df_searchterms['output_file'] = (
        df_searchterms['output_folder'] + '/'
        + df_searchterms['article_file']
        .str.replace('.html', '.txt', regex=False)
        .str.replace('.pdf', '.txt', regex=False)
    )
    df_searchterms['input_file'] = prefix + df_searchterms['path_txt_article']
    df_searchterms['author'] = df_searchterms['author_div']
    df_searchterms['url'] = df_searchterms['target_url']
    # 'start' is divided by 10 to get a page number.
    # NOTE(review): presumably 'start' is the result offset with 10 hits per page - confirm.
    df_searchterms['page'] = (df_searchterms['start'] / 10).astype('int')
    # Pull a 4-digit year that sits between a comma and a dash in the author line.
    df_searchterms['year'] = df_searchterms['author'].str.extract(r',\s*(\d{4})\s*-', expand=False)

    # .copy() makes the column subset an independent frame, so the two
    # assignments below do not raise SettingWithCopyWarning.
    df_searchterms = df_searchterms[
        ['output_folder', 'output_file', 'input_file', 'title', 'author', 'year', 'page', 'citations', 'url']
    ].copy()
    df_searchterms['origin'] = 'google_scholar'
    df_searchterms['model'] = model
    lst_dfs.append(df_searchterms)

df_to_process_google = pd.concat(lst_dfs, axis=0, ignore_index=True)


In [3]:
# Build the Medium work list with the same columns as the Google Scholar one:
# one row per (article, model) pair.
prefix = 'medium/'
path = prefix + 'data'

# The catalogue is the same for every model, so it is read once here instead
# of once per loop iteration.
df_medium_catalogue = pd.read_csv(path + '/medium_search_base_detail_txt.csv', sep=';')

lst_dfs = []
for model in models:
    # ':' is replaced so the model tag can be used as a folder name.
    model_name = model.replace(':', '_')

    # Each model gets its own copy so the shared catalogue is never mutated.
    df_searchterms = df_medium_catalogue.copy()

    # Stored paths use backslashes; normalise them to forward slashes.
    df_searchterms['input_file'] = prefix + df_searchterms['path_txt_article'].str.replace('\\', '/', regex=False)
    df_searchterms['output_folder'] = path + '/score/' + model_name + '/' + df_searchterms['folder']
    df_searchterms['output_file'] = df_searchterms['output_folder'] + '/' + df_searchterms['algoliaObjectId'] + '.txt'
    # 'creator' is parsed as text: take the value following the 'name' key.
    df_searchterms['author'] = df_searchterms['creator'].str.extract(r"'name':\s*'([^']+)'", expand=False)
    # 'firstPublishedAt' is interpreted as epoch milliseconds; converted in one
    # vectorized call rather than row by row.
    df_searchterms['year'] = pd.to_datetime(df_searchterms['firstPublishedAt'], unit='ms').dt.year
    # 'voterCount' fills the 'citations' column shared with the Google Scholar list.
    df_searchterms['citations'] = df_searchterms['voterCount']
    # No page value is available for this source.
    df_searchterms['page'] = None
    df_searchterms['url'] = df_searchterms['mediumUrl']

    # .copy() makes the column subset an independent frame, so the two
    # assignments below do not raise SettingWithCopyWarning.
    df_searchterms = df_searchterms[
        ['output_folder', 'output_file', 'input_file', 'title', 'author', 'year', 'page', 'citations', 'url']
    ].copy()
    df_searchterms['origin'] = 'medium'
    df_searchterms['model'] = model
    lst_dfs.append(df_searchterms)

df_to_process_medium = pd.concat(lst_dfs, axis=0, ignore_index=True)

In [68]:
# Stack the Google Scholar and Medium work lists into a single frame with a fresh 0..n-1 index.
df_to_process = pd.concat([df_to_process_google,df_to_process_medium], axis = 0, ignore_index=True)

In [70]:
def get_result(record):
    """Decide whether a work-list row still has to be sent to the model.

    Parameters
    ----------
    record : mapping (dict or DataFrame row)
        Must provide the keys 'output_folder', 'output_file' and 'input_file'.

    Returns
    -------
    tuple (need_processing, output_content)
        (False, None)  - the input file is missing, nothing can be done.
        (False, text)  - the output file already exists; `text` is its content.
        (True, None)   - the input exists and no output has been written yet.

    Side effect: the output folder is created (with parents) on every call,
    even when the input file turns out to be missing.
    """
    # Created up front so the folder exists when the answer is written later.
    Path(record['output_folder']).mkdir(parents=True, exist_ok=True)

    input_file_path = Path(record['input_file'])
    output_file_path = Path(record['output_file'])

    # is_file() is already False for a non-existent path, so no separate
    # exists() check is needed.
    if not input_file_path.is_file():
        return False, None

    if output_file_path.is_file():
        # read_text opens and closes the file itself, so the handle cannot
        # leak if reading or decoding fails.
        return False, output_file_path.read_text(encoding="utf-8")

    return True, None

# Run get_result on every row and unpack its (need_processing, output_content)
# tuple into two columns.
row_status = df_to_process.apply(get_result, axis=1)
df_to_process['need_processing'] = row_status.map(lambda status: status[0])
df_to_process['output_content'] = row_status.map(lambda status: status[1])

# For rows that already have a stored answer, keep only the text after the
# last '</think>' marker and read the integer inside <SCORE>...</SCORE>.
# Rows without an answer, or without a <SCORE> tag, end up with NaN.
ix = df_to_process['output_content'].notnull()
final_answer = (
    df_to_process.loc[ix, 'output_content']
    .str.split('</think>')
    .map(lambda parts: parts[-1])
    .astype('string')
)
df_to_process.loc[ix, 'score'] = (
    final_answer.str.extract(r'<SCORE>(\d+)</SCORE>').iloc[:, 0].astype('float64')
)

df_to_process = df_to_process.drop_duplicates(ignore_index=True)

In [72]:
# Split the work list by model: 'qwen3:8b' is the first step, 'qwen3:30b' the second.
is_first_step = df_to_process['model'] == 'qwen3:8b'
is_second_step = df_to_process['model'] == 'qwen3:30b'
df_to_process_first_step = df_to_process.loc[is_first_step].reset_index(drop=True)
df_to_process_second_step = df_to_process.loc[is_second_step].reset_index(drop=True)

# Only articles the first step scored at 80 or above go on to the second step.
passed_first_step = df_to_process_first_step['score'] >= 80
lst_files_second_step = df_to_process_first_step.loc[passed_first_step, 'input_file'].values
in_second_step = df_to_process_second_step['input_file'].isin(lst_files_second_step)
df_to_process_second_step = df_to_process_second_step.loc[in_second_step].copy()

# Rebuild the work list from both steps and keep only rows that still need a model run.
df_to_process = pd.concat(
    [df_to_process_first_step, df_to_process_second_step],
    axis=0,
    ignore_index=True,
)
df_to_process = df_to_process.loc[df_to_process['need_processing']].reset_index(drop=True)

In [79]:
# Export the second-step rows scored at 80 or above to 'summary.csv', without
# the bookkeeping columns.
# DataFrame.to_csv returns None when it is given a path, so chaining it onto
# the assignment left `df` bound to None. Filtering and writing are now
# separate statements and `df` holds the exported frame.
df = (
    df_to_process_second_step[df_to_process_second_step['score'] >= 80]
    .drop(columns=['output_folder', 'output_file', 'input_file', 'need_processing', 'model'])
)
df.to_csv('summary.csv', index=False, sep=';')

In [7]:
# Number of second-step rows that already have a stored model answer.
int(df_to_process_second_step['output_content'].notnull().sum())

973

In [8]:
# Number of second-step rows scored at 80 or above (NaN scores do not count).
int((df_to_process_second_step['score'] >= 80).sum())

444

In [9]:
# One dict per remaining work-list row, for the scoring loop below to iterate over.
records = df_to_process.to_dict(orient = 'records')

In [10]:
# Load the prompt template. It is later filled with `prompt.format(document=...)`.
# The context manager closes the file even if reading or decoding fails.
with open("prompt_llm.xml", "r", encoding="utf-8") as f:
    prompt = f.read()


In [11]:
# Send every pending (article, model) pair to the model and store the raw
# answer in the row's output file.
for record in tqdm(records):

    # The context manager closes the input file even if read() raises.
    with open(record['input_file'], "r", encoding="utf-8") as f:
        document = f.read()

    response: ChatResponse = chat(model=record['model'], messages=[
        {
            'role': 'user',
            'content': prompt.format(document=document),
        },
    ])

    content = response['message']['content']

    # Create the target folder here instead of relying on an earlier cell
    # having created it as a side effect.
    Path(record['output_file']).parent.mkdir(parents=True, exist_ok=True)
    with open(record['output_file'], "w", encoding="utf-8") as f:
        f.write(content)

    
    
    



0it [00:00, ?it/s]