In [None]:
import pandas as pd
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser 
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_community.llms import Ollama
import sqlalchemy as sal
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import re
from dotenv import load_dotenv
load_dotenv()
import stanza
import warnings
warnings.filterwarnings("ignore")

# Creation of PERSUADE corpus

In [None]:
# PERSUADE 2.0 corpus, source: https://github.com/scrosseye/persuade_corpus_2.0
df = pd.read_csv(
    'persuade_2.0_human_scores_demo_id_github.csv',
    sep=',',
)

In [None]:
# Keep every essay except those written for the two excluded prompts.
df_filtered = df[~df['prompt_name'].isin(['Exploring Venus', 'Facial action coding system'])]

In [None]:
# Preview the first rows of the filtered corpus.
df_filtered.head()

In [None]:
# List the distinct prompt names that remain after filtering.
df_filtered['prompt_name'].unique()

In [None]:
# List the distinct values of the assignment column.
df_filtered['assignment'].unique()

In [None]:
# Retain only essays whose holistic score is 5 or higher.
df_filtered = df_filtered.loc[df_filtered['holistic_essay_score'].ge(5)]

In [None]:
# Mean of the word_count column over the retained essays.
df_filtered['word_count'].mean()

In [None]:
# Draw a fixed-size random sample of essays for every prompt.
SAMPLE_SIZE_PER_PROMPT = 6
SAMPLE_SEED = 42  # fixed seed so that Restart & Run All selects the same essays

# Collect the per-prompt samples first and concatenate once: calling pd.concat
# inside the loop re-copies all previously collected rows on every iteration.
prompt_samples = [
    df_filtered[df_filtered['prompt_name'] == p].sample(
        SAMPLE_SIZE_PER_PROMPT, random_state=SAMPLE_SEED
    )
    for p in df_filtered['prompt_name'].unique()
]
# pd.concat raises on an empty list, so fall back to an empty frame as before.
df_sample = pd.concat(prompt_samples) if prompt_samples else pd.DataFrame()

In [None]:
# Save each sampled essay as "<essay id>_<prompt name>.txt" in the working directory.
essay_fields = zip(df_sample["essay_id_comp"], df_sample["prompt_name"], df_sample["full_text"])
for essay_id, prompt_name, full_text in essay_fields:
    # Double quotes and question marks are removed from the prompt name
    # before it becomes part of the file name.
    prompt_label = prompt_name.replace("\"", "").replace("?", "")
    file_name = essay_id + '_' + prompt_label + '.txt'
    with open(file_name, "w", encoding='utf-8') as f:
        f.write(full_text)

# Creation of LLM corpus

In [None]:
# Sampling temperature shared by all LLM clients configured in this notebook.
temperature = 0.7

In [None]:
# Directory (inside the working directory) that holds the essay prompt files.
input_dir = f"{os.getcwd()}/prompts"

In [None]:
# Load every prompt file as a [topic_name, prompt_text] pair.
topics = []
# os.listdir returns entries in arbitrary order; sorting keeps the topic order
# (and therefore positional look-ups such as topics[2]) stable between runs.
for file in sorted(os.listdir(input_dir)):
    input_file = os.path.join(input_dir, file)
    # Skip sub-directories; only regular files are prompts.
    if not os.path.isfile(input_file):
        continue
    # splitext removes whatever extension the file has, instead of blindly
    # cutting the last four characters of the name.
    topic_name = os.path.splitext(file)[0]
    print(topic_name)
    with open(input_file, "r", encoding='utf-8') as f:
        text = f.read()
    topics.append([topic_name, text])

In [None]:
# Spot check: show the third loaded [topic name, prompt text] entry.
print(topics[2])

In [None]:
def clean_text(text):
    """Return ``text`` with every ``*`` and ``#`` character removed."""
    for symbol in ("*", "#"):
        text = text.replace(symbol, "")
    return text

In [None]:
def invoke_llm(llm, name):
    """Generate one argumentative essay per topic with ``llm`` and save it to disk.

    For every ``[topic_name, prompt_text]`` entry of the module-level ``topics``
    list, the prompt text is inserted into the essay instruction, sent to the
    model, passed through ``clean_text`` and written to
    ``<name>_<topic_name>.txt`` in the current working directory.

    Args:
        llm: Model object that can be piped into a chain (``prompt | llm | parser``).
        name: Label of the model; used as the file-name prefix.

    Returns:
        None. The essays are written to text files; each file name is printed.
    """
    # The template, the parser and the chain do not depend on the topic, so
    # they are built once instead of once per loop iteration.
    prompt = ChatPromptTemplate.from_template("""
            Your task is to write an argumentative essay on the following topic. Your aim is to persuade the reader. 
            You utilize critical thinking and logical reasoning in your argument to ensure discourse coherence and cohesion. 
            You make extensive use of complex, compound sentences. You do NOT use lists or bullet points.
            Topic: {topic}
        """)
    chain = prompt | llm | StrOutputParser()

    for topic in topics:
        file_name = name + "_" + topic[0] + ".txt"
        print(file_name)

        # topic[1] is the prompt text read from the prompt file.
        answer = clean_text(chain.invoke({"topic": topic[1]}))

        with open(file_name, "w", encoding='utf-8') as f:
            f.write(answer)
            

In [None]:
# ChatGPT (OpenAI API): the key is read from the environment.
llm = ChatOpenAI(
    model="gpt-4",
    temperature=temperature,
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)
invoke_llm(llm, "ChatGPT")

In [None]:
# Gemini (Google Generative AI API): the key is read from the environment.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=temperature,
    google_api_key=os.getenv("GOOGLE_API_KEY"),
)
invoke_llm(llm, "Gemini")

In [None]:
# GEMMA OLLAMA COLAB
# The Ollama server is reached through an ngrok tunnel. Tunnel URLs of this kind
# are presumably short-lived (TODO confirm), so the address can be overridden
# with the OLLAMA_BASE_URL environment variable (e.g. in the .env file) instead
# of editing the notebook; the previously hard-coded URL remains the default.
llm = Ollama(
    base_url = os.getenv("OLLAMA_BASE_URL", "https://ab9f-34-83-144-154.ngrok-free.app"),
    model="gemma2",
    temperature = temperature
)
invoke_llm(llm, "Gemma")

In [None]:
# Mistral (Mistral AI API): the key is read from the environment.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=temperature,
    api_key=os.getenv("MISTRAL_API_KEY"),
)
invoke_llm(llm, "Mistral")

# Search for linking adverbials

In [None]:
# ODBC options appended to the connection URL as query parameters.
# NOTE(review): the ODBC keyword is usually spelled "Trusted_Connection";
# confirm that the "TrustedConnection" key has the intended effect.
odbc_options = {
    "driver": "ODBC Driver 18 for SQL Server",
    "Encrypt": "yes",
    "TrustServerCertificate": "yes",
    "TrustedConnection": "yes",
}

# Local SQL Server instance ("."), database "argumentation".
connection_url = URL.create(
    drivername="mssql+pyodbc",
    host=".",
    port=1433,
    database="argumentation",
    query=odbc_options,
)
engine = create_engine(connection_url)
conn = engine.connect()

In [None]:
# Load the complete marker table, indexed by its id column.
query = 'SELECT * FROM dbo.MARKERS'
df = pd.read_sql(sql=query, con=engine, index_col='id')

In [None]:
# Keep only the markers whose theory column equals 'Biber'.
df = df.loc[df['theory'].eq('Biber')]

In [None]:
# Column layout of the match table built by the adverbial search.
columns = [
    'marker_id',
    'valid',
    'author',
    'file',
    'construction_type',
    'start_pos',
    'end_pos',
    'left_context',
    'match',
    'right_context',
]

In [None]:
def search_for_adverbials(input_dir):
    """Find all occurrences of the linking-adverbial markers in a directory of texts.

    Every regular file in ``input_dir`` is read as UTF-8, its newlines are
    replaced by spaces, and the text is searched case-insensitively for each
    ``search_marker`` of the module-level marker table ``df``. The markers
    ``!`` and ``?`` are skipped, as are empty or missing markers.

    Args:
        input_dir: Directory that contains the text files to search.

    Returns:
        A DataFrame laid out according to the module-level ``columns`` list,
        with one row per match: the marker's index in ``df``, ``valid`` set
        to 0, the author class (``'LLM'`` when the file name starts with a
        known model name, otherwise ``'Student'``), the file name, the
        marker's construction type, the match offsets, the matched text and
        up to 250 characters of context on either side.
    """
    llm_prefixes = ('ChatGPT', 'Claude', 'Gemini', 'Gemma', 'Llama', 'Mistral')
    context_width = 250

    # Compile each marker once, not once per file.
    marker_patterns = []
    for marker_id, marker_row in df.iterrows():
        marker = marker_row['search_marker']
        if not isinstance(marker, str) or marker.strip() in ('', '!', '?'):
            continue
        # A plain \b next to a punctuation edge (e.g. the final dot of "i.e.")
        # would require a word character right after the marker, so "i.e. " or
        # "i.e.," was never found. A boundary is therefore enforced only on
        # edges that are word characters, where it is equivalent to \b.
        left_edge = r"(?<!\w)" if re.match(r"\w", marker[0]) else ""
        right_edge = r"(?!\w)" if re.match(r"\w", marker[-1]) else ""
        pattern = re.compile(left_edge + "(" + re.escape(marker) + ")" + right_edge, re.IGNORECASE)
        marker_patterns.append((marker_id, marker_row['construction_type'], pattern))

    # Collect plain records and build the frame once at the end; concatenating
    # a one-row frame per match re-copies the whole table every time.
    records = []
    # Sorted so that the row order does not depend on the file system.
    for file in sorted(os.listdir(input_dir)):
        input_file = os.path.join(input_dir, file)
        # Skip sub-directories.
        if not os.path.isfile(input_file):
            continue

        with open(input_file, "r", encoding='utf-8') as f:
            text = f.read().replace("\n", " ")

        author = 'LLM' if file.startswith(llm_prefixes) else 'Student'

        for marker_id, construction_type, pattern in marker_patterns:
            for m in pattern.finditer(text):
                records.append({
                    'marker_id': marker_id,
                    'valid': 0,
                    'author': author,
                    'file': file,
                    'construction_type': construction_type,
                    'start_pos': m.start(),
                    'end_pos': m.end(),
                    # Clamp at 0 so a match near the start keeps its full left context.
                    'left_context': text[max(m.start() - context_width, 0):m.start()],
                    'match': m.group(),
                    'right_context': text[m.end():m.end() + context_width],
                })

    return pd.DataFrame(records, columns=columns)


In [None]:
# Search the LLM-written essays for linking adverbials.
input_dir = f"{os.getcwd()}/LLM"
df_match_llm = search_for_adverbials(input_dir)

In [None]:
# Search the student essays for linking adverbials.
input_dir = f"{os.getcwd()}/Students"
df_match_stu = search_for_adverbials(input_dir)

In [None]:
# Number of marker matches found in the LLM essays.
len(df_match_llm)

In [None]:
# Number of marker matches found in the student essays.
len(df_match_stu)

In [None]:
# Stack the LLM and student matches into one table with a fresh 0..n-1 index.
df_match = pd.concat(objs=[df_match_llm, df_match_stu], ignore_index=True)

In [None]:
# Export all matches to an Excel workbook, without the index column.
df_match.to_excel(excel_writer="linking_adverbials.xlsx", index=False)

# Parsing

In [None]:
def parse_files(input_dir, output_csv):
    """Parse every text file in ``input_dir`` with Stanza and save the result as CSV.

    Each regular file is read as UTF-8 and run through the English Stanza
    pipeline. The entries of every sentence (as returned by ``doc.to_dict()``)
    become rows tagged with the file name (``file_id``) and the sentence's
    position in the document (``sent_id``).

    Args:
        input_dir: Directory that contains the text files to parse.
        output_csv: Path of the CSV file to write (without the index column).

    Returns:
        None. The table is written to ``output_csv``. It is written once, after
        the last file, and also when parsing is interrupted by an error, so
        the rows collected up to that point are not lost.
    """
    nlp = stanza.Pipeline('en', download_method=None)
    columns = ['file_id', 'sent_id','id','text','lemma','upos','xpos','feats','head','deprel','start_char','end_char','ner','multi_ner', 'misc']

    # Collect one frame per sentence and concatenate once at the end; calling
    # pd.concat inside the loops re-copies all earlier rows on every iteration.
    sentence_frames = []

    try:
        for file in os.listdir(input_dir):
            input_file = os.path.join(input_dir, file)
            print(input_file)
            # Skip sub-directories.
            if not os.path.isfile(input_file):
                continue

            with open(input_file, "r", encoding='utf-8') as f:
                text = f.read()

            doc = nlp(text)
            for sent_id, sentence in enumerate(doc.to_dict()):
                sentence_df = pd.DataFrame.from_dict(sentence)
                sentence_df['file_id'] = file
                sentence_df['sent_id'] = sent_id
                sentence_frames.append(sentence_df)
    finally:
        if sentence_frames:
            parse_df = pd.concat(sentence_frames)
            # Expected columns first, then any further keys in the order in
            # which they first appeared.
            extra_columns = [c for c in parse_df.columns if c not in columns]
            parse_df = parse_df.reindex(columns=columns + extra_columns)
        else:
            # Nothing parsed: still write a header-only file.
            parse_df = pd.DataFrame(columns=columns)
        parse_df.to_csv(output_csv, index=False)
    

In [None]:
# Parse the LLM-written essays.
input_dir = f"{os.getcwd()}/LLM"
parse_files(input_dir, "parsed_llm.csv")

In [None]:
# Parse the student essays.
input_dir = f"{os.getcwd()}/Students"
parse_files(input_dir, "parsed_student.csv")