<a href="https://colab.research.google.com/github/yoyodahary/ReaserchProject/blob/master/Human_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import random
import re
import h5py
import math
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tabulate import tabulate
import os
import time
import psutil
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from multiprocessing import Process
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
import json


In [2]:
root = ""
data_path = root + 'data/'
tfidf_path = data_path + "tf-idf/"
vectors_path = data_path + "vectors/"
classifications_path = data_path + "classifications/"
results_path = data_path + "results/"

df = pd.read_csv(data_path+"AI_Human_cleaned.csv", usecols = ["text","prompt_name","source","label"])

#df = df.sample(n=1000, random_state=42)

train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

folder_path = data_path
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

train_df.to_csv(data_path+'train_dataset.csv',index=False)
train_df.to_hdf(data_path+'train_dataset.h5', key='train_dataset', mode='w')

test_df.to_csv(data_path+'test_dataset.csv',index=False)
test_df.to_hdf(data_path+'test_dataset.h5', key='test_dataset', mode='w')

In [3]:
SPECIAL_CHARCATERS_REMOVAL=r"\b\w+\b"
TOKEN_PATTERN="[^ \n]+"

# Stop Words
nltk.download('stopwords')
STOP_WORDS = stopwords.words('english')
STOP_WORDS += [word.capitalize() for word in STOP_WORDS]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

# Abbreviations
ABBREVIATIONS = pd.read_excel(data_path+'abbreviations_eng.xls')
ABBREVIATIONS['abbr'] = ABBREVIATIONS['abbr'].astype(str)
ABBREVIATIONS_lowercased = ABBREVIATIONS.copy()
ABBREVIATIONS_lowercased['abbr'] = ABBREVIATIONS_lowercased['abbr'].str.lower()
ABBREVIATIONS_lowercased['long'] = ABBREVIATIONS_lowercased['long'].str.lower()
ABBREVIATIONS = pd.concat([ABBREVIATIONS, ABBREVIATIONS_lowercased], ignore_index=True)
ABBR_PATTERN = r'\b(?:' + '|'.join(map(re.escape, ABBREVIATIONS['abbr'])) + r')\b'
def expand_abbreviations(text):
    def replace(match):
        return ABBREVIATIONS.loc[ABBREVIATIONS['abbr'] == match.group(0), 'long'].iloc[0]

    return re.sub(ABBR_PATTERN, replace, text)

In [4]:
def read_texts(train_or_test):
  """
  reads the data from the train tweets sheet
  :return: the data as pandas data set
  """

  store = pd.HDFStore(data_path+f'{train_or_test}_dataset.h5')
  df = store.select(f'{train_or_test}_dataset')
  store.close()

  columns_to_include = ["text"]
  df = df[columns_to_include]
  df.dropna(subset=["text"], inplace=True)
  return df

def read_classifications(train_or_test="train", classification = "label"):
  """
  reads the data from the train tweets sheet
  :return: the data as pandas data set
  """

  store = pd.HDFStore(data_path+f'{train_or_test}_dataset.h5')
  df = store.select(f'{train_or_test}_dataset')
  store.close()

  columns_to_include = [classification]
  df = df[columns_to_include]
  df.dropna(subset=[classification], inplace=True)
  return df


def get_terms():
  columns= ["nt", "f", "tf","idf","tfidf"]
  meanings= ["Number of different documents that the word appears in.",
   "Number of appearances of the word in all documents.",
   "Term frequency.",
   "Inverse document frequency.",
   "Term fruquency multiplied by inverse document frequency."]
  return columns, meanings


def write_tf_idf_chart(preprocessing,path = tfidf_path):
  train_text = read_texts("train")
  vectorizer = VECTORIZERS[preprocessing]
  if "L" in preprocessing and "O" in preprocessing:
      train_text["text"] = "LowerCode " + train_text["text"]
  elif "L" in preprocessing:
      train_text["text"] = train_text["text"].str.lower()


  sparse_matrix = vectorizer.fit_transform(train_text["text"])
  dense_matrix = sparse_matrix.toarray()
  sig_f = sparse_matrix.sum()

  print(f"Me: sig_f is:{sig_f}")

  # Get the vocabulary from the vectorizer object
  vocab = vectorizer.get_feature_names_out()
  df = pd.DataFrame(columns=['word', 'nt', 'f', 'tf', 'idf', 'tf-idf'])


  print(f"calculating the tfidf table for {preprocessing}:")
  for word , index in tqdm(vectorizer.vocabulary_.items()):
      # print(word)
      # The column of tf-idf values of the specific word, as the rows are the tweets
      word_column=dense_matrix[:, index]
      # Count the number of texts the word appears in
      nt = np.count_nonzero(word_column)
      # Count the number of times the word is used in all tweets
      f = np.sum(word_column)
      # term frequency in a document compared to the number of terms in the corpus
      tf = f / sig_f
      # shape[0] is the number of rows, aka the number of texts
      n = dense_matrix.shape[0]
      # Calculate the idf value
      idf =np. log(n / (nt + 1))
      # Calculate the tf-idf value
      tf_idf = tf * idf
      # append all the data we collected so far to each word to the data frame
      new_df = pd.DataFrame({'word': word, 'nt': nt, 'f': f, 'tf': tf, 'idf': idf, 'tf-idf': tf_idf}, index=[0])

      df = pd.concat([df, new_df], ignore_index=True)
  print("done!")
  df = df.sort_values('tf-idf', ascending=False)

  df.to_csv(path+f"tfidf_table_{preprocessing}.csv")

In [5]:
NT = 1  # the minimum nuber of tweets a word in tf idf should show
NUMBER_OF_WORDS = [1000,2000,3000,4000,5000]  # the number of the highest tfidf words
VECTORIZERS={
    # the tf-idf vectorizers by preprocessing method
    #  'N' : CountVectorizer(min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN),#None, baseline

    #  'S':CountVectorizer(min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS),#Stop word removal
    #  'C':CountVectorizer(min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL),#special Characters removal
    #  'O':CountVectorizer(min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN, preprocessor=expand_abbreviations),#Open abbreviations
    # # '3gram' : CountVectorizer(min_df=NT,lowercase=False, analyzer='char', ngram_range=(3, 3)),
    # # '4gram' : CountVectorizer(min_df=NT,lowercase=False, analyzer='char', ngram_range=(4, 4)),
     'L' : CountVectorizer(min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN),# Lowercase

    # # pairing preprocssing methods
    'SC':CountVectorizer(min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS),
    # 'SO':CountVectorizer(min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS,preprocessor=expand_abbreviations),
    'SL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS),
    # 'CO':CountVectorizer(min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL,preprocessor=expand_abbreviations),
    'CL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL),
    # 'OL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN,preprocessor=expand_abbreviations),

    # # Trio preprocessing methods
    # 'SCO':CountVectorizer(min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS,preprocessor=expand_abbreviations),
    # 'COL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL,preprocessor=expand_abbreviations),
    # 'SOL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS, preprocessor=expand_abbreviations),
    'SCL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS),

    # # # All preprocessing
    # 'SCOL':CountVectorizer(min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS,preprocessor=expand_abbreviations)
}

In [None]:

#itirate over each preprocessing method and writes the tfidf chart
for preprocessing, vectorizer in VECTORIZERS.items():
  print("------------------------------------------")
  print(f"writing the {preprocessing} tf-idf chart:")
  write_tf_idf_chart(preprocessing)
  print("------------------------------------------")

In [6]:
CLASSIFY = ['label']

def write_classification(text, classification, set):
    classified_text_df = pd.DataFrame(columns=["class: "+classification])
    if classification == 'label':
        classified_text_df["class: "+classification] = text["label"].apply(lambda x: 0 if x == 0 else 1)
    folder_path = 'classification/'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    classified_text_df.to_csv(classifications_path+f'{set}_classification_{classification}.csv', index=False)    
    classified_text_df.to_hdf(classifications_path+f'{set}_classification_{classification}.h5', key='classification', mode='w')

def write_vectors(tfidf_vocabulary,vectors,set,amount,preprocessing):
  dense_vectors = vectors.toarray()
  df = pd.DataFrame(dense_vectors, columns=tfidf_vocabulary)
  folder_path = 'vectors/'
  if not os.path.exists(folder_path):
        os.makedirs(folder_path)
  df.to_csv(f'vectors/{preprocessing}_{set}_vectors_{amount}.csv',index=False)
  df.to_hdf(f'vectors/{preprocessing}_{set}_vectors_{amount}.h5', key='vectors', mode='w')

def get_words(amount,preprocessing):
  words = pd.read_csv(tfidf_path+f'tfidf_table_{preprocessing}.csv')
  words = words.head(amount)
  words = words['word'].tolist()
  return words

In [None]:
train_texts = read_texts("train")
test_texts = read_texts("test")
for amount in NUMBER_OF_WORDS:
  print(amount)
  VECTORIZERS={ # the tf-idf vectorizers by preprocessing method
    'N' : TfidfVectorizer(vocabulary=get_words(amount,'N'),min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN),#None, baseline
    
    'S':TfidfVectorizer(vocabulary=get_words(amount,'S'),min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS),#Stop word removal
    'C':TfidfVectorizer(vocabulary=get_words(amount,'C'),min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL),#special Characters removal
    # 'O':TfidfVectorizer(vocabulary=get_words(amount,'O'),min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN, preprocessor=expand_abbreviations),#Open abbreviations
    'L' : TfidfVectorizer(vocabulary=get_words(amount,'L'),min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN),# Lowercase
    
    # pairing preprocssing methods
    'SC':TfidfVectorizer(vocabulary=get_words(amount,'SC'),min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS),
    # 'SO':TfidfVectorizer(vocabulary=get_words(amount,'SO'),min_df=NT,lowercase=False, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS,preprocessor=expand_abbreviations),
    'SL':TfidfVectorizer(vocabulary=get_words(amount,'SL'),min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS),
    # 'CO':TfidfVectorizer(vocabulary=get_words(amount,'CO'),min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL,preprocessor=expand_abbreviations),
    'CL':TfidfVectorizer(vocabulary=get_words(amount,'CL'),min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL),
    # 'OL':TfidfVectorizer(vocabulary=get_words(amount,'OL'),min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN,preprocessor=expand_abbreviations),
    
    # Trio preprocessing methods
    # 'SCO':TfidfVectorizer(vocabulary=get_words(amount,'SCO'),min_df=NT,lowercase=False, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS,preprocessor=expand_abbreviations),
    # 'COL':TfidfVectorizer(vocabulary=get_words(amount,'COL'),min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL,preprocessor=expand_abbreviations),
    # 'SOL':TfidfVectorizer(vocabulary=get_words(amount,'SOL'),min_df=NT,lowercase=True, token_pattern=TOKEN_PATTERN,stop_words=STOP_WORDS, preprocessor=expand_abbreviations),
    'SCL':TfidfVectorizer(vocabulary=get_words(amount,'SCL'),min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS),
    
    # All preprocessing
    # 'SCOL':TfidfVectorizer(vocabulary=get_words(amount,'SCOL'),min_df=NT,lowercase=True, token_pattern=SPECIAL_CHARCATERS_REMOVAL,stop_words=STOP_WORDS,preprocessor=expand_abbreviations)
}
  for preprocessing, vectorizer in tqdm(VECTORIZERS.items()):
    tfidf_vocabulary=get_words(amount,preprocessing)
    train_vectors = vectorizer.fit_transform(train_texts["text"])
    write_vectors(tfidf_vocabulary,train_vectors,"train",amount,preprocessing)

    test_vectors = vectorizer.fit_transform(test_texts["text"])
    write_vectors(tfidf_vocabulary,test_vectors,"test",amount,preprocessing)

In [16]:
LABELS = df['label'].to_numpy()

OVERSAMPLERS = {
    'ROS': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    'NONE': None
}

ML_CLASSIFIERS = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(random_state=42)
}





In [9]:
def load_vectors(set,amount,preprocessing):
  vectors = pd.read_hdf(vectors_path+f'{preprocessing}_{set}_vectors_{amount}.h5')
  return vectors



def oversample(oversampler, amount, preprocessing):
    train_vectors = load_vectors("train",amount,preprocessing)
    train_labels = read_classifications("train")

    oversampler = OVERSAMPLERS[oversampler]

    if oversampler is not None:
        train_vectors, train_labels = oversampler.fit_resample(train_vectors, train_labels)

    return train_vectors, train_labels

def classify(x_train, y_train, x_test, y_test, classifier):
    classifier = ML_CLASSIFIERS[classifier]
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    return y_pred

def evaluate(y_test, y_pred):
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    results = {
        'f1': f1,
        'precision': precision,
        'accuracy': accuracy,
        'recall': recall,
        'roc_auc': roc_auc
    }

    return results


In [17]:
for amount in NUMBER_OF_WORDS:
  for preprocessing in VECTORIZERS.keys():
    for oversampler in OVERSAMPLERS.keys():
      for classifier in ML_CLASSIFIERS.keys():
        print(f"amount: {amount}, preprocessing: {preprocessing}, oversampler: {oversampler}, classifier: {classifier}")
        train_vectors, train_labels = oversample(oversampler, amount, preprocessing)
        test_vectors = load_vectors("test",amount,preprocessing)
        test_labels = read_classifications("test")

        y_pred = classify(train_vectors, train_labels, test_vectors, test_labels, classifier)
        results = evaluate(test_labels, y_pred)

        file_name = f'{classifier}.json'

        path_to_save = results_path+f'{preprocessing}/{amount}/{oversampler}/'+file_name

        # Save the results
        with open(path_to_save, 'w') as f:
            json.dump(results, f)
            



        

amount: 1000, preprocessing: N, oversampler: ROS, classifier: RandomForest


  return fit_method(estimator, *args, **kwargs)


amount: 1000, preprocessing: N, oversampler: ROS, classifier: SVC


  y = column_or_1d(y, warn=True)


amount: 1000, preprocessing: N, oversampler: ROS, classifier: MLP


  y = column_or_1d(y, warn=True)


amount: 1000, preprocessing: N, oversampler: ROS, classifier: MultinomialNB


  y = column_or_1d(y, warn=True)


amount: 1000, preprocessing: N, oversampler: ROS, classifier: LogisticRegression


  y = column_or_1d(y, warn=True)


amount: 1000, preprocessing: N, oversampler: SMOTE, classifier: RandomForest


  return fit_method(estimator, *args, **kwargs)


amount: 1000, preprocessing: N, oversampler: SMOTE, classifier: SVC


  y = column_or_1d(y, warn=True)


In [12]:
if not os.path.exists(results_path):
    os.makedirs(results_path)
for preprocessing in VECTORIZERS.keys():
  folder = results_path + preprocessing
  if not os.path.exists(folder):
    os.makedirs(folder)
  for amount in NUMBER_OF_WORDS:
    amount_folder = folder + '/' + str(amount)
    if not os.path.exists(amount_folder):
      os.makedirs(amount_folder)
      for oversampler in OVERSAMPLERS:
        oversampler_folder = amount_folder + '/' + oversampler
        if not os.path.exists(oversampler_folder):
          os.makedirs(oversampler_folder)