In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import pickle

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from sklearn import set_config; set_config(display='diagram')


In [6]:
def get_huggingface_texts(
    number,
    path="../../raw_data/huggingface.co_human_ai_generated_text/model_training_dataset.csv",
):
    """Load the first ``number`` rows of the HuggingFace human/AI text CSV in long format.

    Each source row holds one human-written text and one AI-generated text.
    The result stacks them so every source row yields two output rows.

    Parameters
    ----------
    number : int or None
        How many rows to read from the top of the CSV. ``None`` reads the
        whole file.
    path : str or path-like, optional
        CSV file to read. It must contain the columns ``id``, ``human_text``
        and ``ai_text``. Defaults to the project's raw-data location.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``text`` and ``generated`` (0 = human, 1 = AI).
        All human rows come first, followed by all AI rows. The index of the
        source rows is preserved, so each index label appears twice.
    """
    # `nrows` reads only the rows we need; no chunk iterator to break out of.
    huggingface_df = pd.read_csv(path, nrows=number)

    def _labelled(text_column, label):
        # Keep the id, rename the chosen text column to "text", tag its origin.
        return (
            huggingface_df[["id", text_column]]
            .rename(columns={text_column: "text"})
            .assign(generated=label)
        )

    return pd.concat(objs=[_labelled("human_text", 0), _labelled("ai_text", 1)])

In [10]:
# Working sample: the first 1000 CSV rows (the loader returns one human and one AI row per CSV row).
df = get_huggingface_texts(number=1000)

In [19]:
# https://github.com/PrithivirajDamodaran/Gramformer
# pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
# python -m spacy download en_core_web_sm

from gramformer import Gramformer
import torch

def set_seed(seed):
    """Seed torch's random number generators.

    Always seeds via ``torch.manual_seed``; when CUDA is available, also
    seeds every CUDA device via ``torch.cuda.manual_seed_all``.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(1212)

gf = Gramformer(models=1, use_gpu=False)  # 1=corrector, 2=detector

# Run the grammar corrector over every sentence of every text and print
# whether the sentence came back unchanged (✅) or was rewritten (❌),
# prefixed by the text's `generated` label (0 = human, 1 = AI).
# NOTE: this calls the model once per sentence on CPU, so it is slow on the full sample.
for text, generated in zip(df['text'], df['generated']):
    if not isinstance(text, str):
        # Missing values (e.g. NaN) have no sentences to check.
        continue

    for fragment in text.split("."):
        fragment = fragment.strip()
        if not fragment:
            # Splitting on "." leaves an empty piece after the final period;
            # sending a bare "." to the model is meaningless.
            continue

        # Restore the period removed by split() exactly once.
        sentence = f"{fragment}."
        corrected_sentences = gf.correct(sentence, max_candidates=1)
        # If the corrector yields no candidate, treat the sentence as unchanged
        # rather than failing on an empty result.
        corrected_sentence = next(iter(corrected_sentences), sentence)

        if sentence == corrected_sentence:
            print(f"✅ {generated}\t")
        else:
            print(f"❌ {generated}\t")
            print(f"{sentence}")
            print(f"{corrected_sentence}")




[Gramformer] Grammar error correct/highlight model loaded..
✅ 0	
❌ 0	
Some school have decreased bullying and high and middle school because some students get bullied.
Some schools have decreased bullying in high and middle school because some students get bullied.
❌ 0	
Some Schools offter distance learning as an option for students to attend classes from home by way of online or video conferencing.
Some schools offer distance learning as an option for students to attend classes from home by way of online or video conferencing.
❌ 0	
Students can ncreased to learn at home.
Students can continue to learn at home.
❌ 0	
Also is more hard to students understand by online.
Also is more difficult for students to understand online.
❌ 0	
students get distract at home.
students get distracted at home.
❌ 0	
Some schools in United States ofter classes from home because is good option to students.
Some schools in United States offer classes from home because it is a good option for students.
❌ 0	
A

KeyboardInterrupt: 

In [29]:
import spacy
import contextualSpellCheck

# Build a spaCy pipeline with the contextualSpellCheck component attached.
nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)

# Smoke test on a sentence with three deliberate misspellings.
doc = nlp('This is my neww job. My firsrt neme is Jack.')
suggestions = doc._.suggestions_spellCheck

print(len(suggestions))           # => Number of errors: 3
print(suggestions)                # => {neww: 'new', firsrt: 'best', neme: 'name'}
print(doc._.outcome_spellCheck)   # => This is my new job. My best name is Jack.

3
{neww: 'new', firsrt: 'best', neme: 'name'}
This is my new job. My best name is Jack.


In [33]:
# Print, for each text, the total number of spelling suggestions that
# contextualSpellCheck makes across its sentences.
for text in df['text']:
    if not isinstance(text, str):
        # Missing values (e.g. NaN) cannot be split into sentences.
        continue

    errors_nb = 0
    for fragment in text.split(". "):
        fragment = fragment.strip()
        if not fragment:
            continue

        # split(". ") strips the period from every sentence except the last,
        # so only add one when the fragment has no closing punctuation.
        if fragment.endswith((".", "!", "?")):
            sentence = fragment
        else:
            sentence = f"{fragment}."

        doc = nlp(sentence)
        errors_nb += len(doc._.suggestions_spellCheck)

    print(f"{errors_nb}\t")


8	
8	
22	
1	
0	
6	
10	
12	
10	
17	
8	
15	


KeyboardInterrupt: 