In [7]:
# Fetch the NLTK stop-word corpus; a later cell reads it through
# nltk.corpus.stopwords.words('english') to build its stop-word list.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
import argparse
import os
import pickle
import re
import unicodedata

try:
    import cPickle  # Python 2 module name
except ImportError:  # Python 3 has no cPickle module; fall back to pickle under the old name
    import pickle as cPickle

import nltk
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
# spaCy pipeline shared by lemmatize_text; NER is disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['ner'])
# One-off call on a sample sentence; `doc` is not read by any cell shown in this notebook.
# NOTE(review): 'tag' and 'entity' may not match the pipeline's component names
# (spaCy's docs use 'tagger' and 'ner') — confirm these entries have any effect.
doc = nlp(u"I don't want parsed", disable=['parser','tag','entity'])

# Tokenizer shared by remove_stopwords.
tokenizer = ToktokTokenizer()

# The list must exist before entries are removed from it: calling .remove()
# ahead of this assignment raises NameError on a fresh kernel.
stopword_list = nltk.corpus.stopwords.words('english')
# Drop the negations from the stop-word list so remove_stopwords leaves
# 'no' and 'not' in the text.
stopword_list.remove('no')
stopword_list.remove('not')


In [9]:
def strip_html_tags(text):
    """Return the text content of ``text`` with its HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` is returned unchanged.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [12]:
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters removed (not replaced by spaces).
    """
    # The upper-case range must end at 'Z': 'A-z' spans ASCII 65-122 and would
    # therefore also keep '[', '\\', ']', '^', '_' and '`'.
    # Raw string so that \s reaches the regex engine as written.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [13]:
def remove_stopwords(text, is_lower_case=False):
    """Drop stop words from ``text`` and return the remaining tokens space-joined.

    Tokens come from the module-level ``tokenizer`` and are stripped of
    surrounding whitespace. A token is discarded when it appears in the
    module-level ``stopword_list``; unless ``is_lower_case`` is true the token
    is lower-cased for that lookup only (the kept token retains its casing).
    """
    # Choose the lookup key once instead of duplicating the filter per branch.
    if is_lower_case:
        lookup_key = lambda word: word
    else:
        lookup_key = lambda word: word.lower()

    kept = []
    for raw_token in tokenizer.tokenize(text):
        word = raw_token.strip()
        if lookup_key(word) not in stopword_list:
            kept.append(word)
    return ' '.join(kept)

In [14]:
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma, space-joined.

    The text is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the placeholder ``'-PRON-'`` keeps its original surface form.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

In [2]:
def _strip_accents(text):
    """Fold accented characters to plain ASCII (e.g. 'é' becomes 'e')."""
    # NFKD decomposes a character into base letter + combining marks; the
    # ASCII round-trip then discards the marks and any other non-ASCII char.
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')


def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True):
    """Apply the text-cleaning steps to every document in ``corpus``.

    Steps run in this fixed order; those marked with a flag are optional:
    HTML stripping, accent folding, lower-casing, newline replacement,
    padding of ``{ } ( ) . !`` with spaces, lemmatization, special-character
    removal, collapsing of space runs, stop-word removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Run ``strip_html_tags`` on each document.
        contraction_expansion: Accepted for interface compatibility only;
            this function performs no contraction expansion.
        accented_char_removal: Fold accented characters to ASCII.
        text_lower_case: Lower-case each document. Also forwarded to
            ``remove_stopwords`` as ``is_lower_case``.
        text_lemmatization: Run ``lemmatize_text`` on each document.
        special_char_removal: Run ``remove_special_characters``.
        stopword_removal: Run ``remove_stopwords``.

    Returns:
        A new list with one normalized string per input document (an empty
        list for an empty corpus).
    """
    # Compile the loop-invariant patterns once rather than once per document.
    # Only CR/LF belong in the newline class: a '|' inside [...] is a literal
    # pipe, not alternation, so it must not be listed there.
    newline_pattern = re.compile(r'[\r\n]+')
    # Inside the class, '(-)' is a range from '(' to ')', i.e. just the two
    # parentheses; a literal '-' is therefore not padded.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    space_run_pattern = re.compile(' +')

    normalized_corpus = []
    for doc in corpus:
        if html_stripping:
            doc = strip_html_tags(doc)
        if accented_char_removal:
            # Uses the local helper: no remove_accented_chars function is
            # defined in this notebook.
            doc = _strip_accents(doc)
        if text_lower_case:
            doc = doc.lower()
        # Turn line breaks into single spaces.
        doc = newline_pattern.sub(' ', doc)
        # Surround the matched punctuation with spaces so it becomes separate tokens.
        doc = special_char_pattern.sub(" \\1 ", doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if special_char_removal:
            doc = remove_special_characters(doc)
        # Collapse the runs of spaces introduced by the steps above.
        doc = space_run_pattern.sub(' ', doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus

In [1]:
from datasets import load_dataset

# Local IMDB reviews CSV. It is bound to its own name so that the demo load
# below does not replace it before anything has read it.
imdb_dataset = load_dataset('csv', data_files='IMDB-Dataset.csv')

# Remote train/test CSVs from the lhoestq/demo1 dataset repository.
# `dataset` ends up bound to this two-split load.
# NOTE(review): confirm whether later work is meant to use these demo files
# or the IMDB data loaded above.
base_url = 'https://huggingface.co/datasets/lhoestq/demo1/resolve/main/data/'
dataset = load_dataset('csv', data_files={'train': base_url + 'train.csv', 'test': base_url + 'test.csv'})

In [None]:
def random_forest_on_bow(dataset, extend_data=False, vocab_limit=5000, forest_size=100,
                         data_dir=None, random_state=None):
    """Train a random forest on bag-of-words counts and write test predictions to CSV.

    The whole pipeline lives in one function (and therefore one cell): a
    function body cannot continue across notebook cells.

    Args:
        dataset: Dataset identifier; only ``'IMDB-Dataset.csv'`` is accepted.
            It is also embedded in the output file name.
        extend_data: Selects the ``data_extended1...`` pickle instead of
            ``data_extended0...`` and adds ``extended_`` to the output name.
        vocab_limit: ``max_features`` passed to ``CountVectorizer``.
        forest_size: ``n_estimators`` passed to ``RandomForestClassifier``.
        data_dir: Directory that contains ``processed/<pickle file>``.
            Defaults to the module-level ``IMDB_DATA_DIR``, which is not
            defined in this notebook's visible cells and must exist before
            the call when ``data_dir`` is omitted.
        random_state: Forwarded to ``RandomForestClassifier``; ``None`` (the
            default) leaves the forest unseeded.

    Returns:
        None. Predictions are written to
        ``./ModelResponses/<dataset>_forest<N>_bow<M>_[extended_]predictions.csv``.

    Raises:
        ValueError: If ``dataset`` is not a supported identifier.
    """
    supported_datasets = ('IMDB-Dataset.csv',)
    if dataset not in supported_datasets:
        # One up-front check replaces the per-stage "Unknown dataset" branches,
        # which compared against two different spellings of the identifier.
        raise ValueError("ERROR: Unknown dataset %s" % dataset)

    # ---- Load the preprocessed data ------------------------------------
    print("Loading")
    remove_stop = True
    extract_tokens = False
    if data_dir is None:
        data_dir = IMDB_DATA_DIR
    data_file_name = os.path.join(
        data_dir,
        "processed",
        "data_extended%s_remove-stop%s_tokens%s.pkl" % (
            ("1" if extend_data else "0"),
            ("1" if remove_stop else "0"),
            ("1" if extract_tokens else "0"),
        ),
    )
    # SECURITY: unpickling can execute arbitrary code; only load files this
    # project produced itself.
    with open(data_file_name, 'rb') as data_file:
        data = pickle.load(data_file)

    # Layout of the pickled sequence as used here: [0] labeled train table,
    # [2] test table, [3] labeled train reviews, [5] test reviews.
    # Entries [1] and [4] are not needed by this function.
    labeled_train, test = data[0], data[2]
    labeled_train_reviews = data[3]
    test_reviews = data[5]
    # NOTE(review): the label column is read as labeled_train['train'] —
    # confirm that key against the code that writes the pickle.
    train_labels = labeled_train['train']

    # ---- Bag-of-words features -----------------------------------------
    print("bag of words")
    vectorizer = CountVectorizer(max_features=vocab_limit)
    train_data_features = vectorizer.fit_transform(labeled_train_reviews).toarray()
    # Report the shape only; dumping the full matrix floods the cell output.
    print(train_data_features.shape)
    # vocabulary_ is the fitted term -> column mapping, so its length is the
    # vocabulary size regardless of which get_feature_names* API is available.
    print("Vocabulary size: %d" % len(vectorizer.vocabulary_))

    # ---- Train -----------------------------------------------------------
    print("Training Random Forest(%d) classifier" % forest_size)
    forest = RandomForestClassifier(n_estimators=forest_size, random_state=random_state)
    forest.fit(train_data_features, train_labels)
    print("done.")

    train_predictions = forest.predict(train_data_features)
    # Assumes the labels are numeric — TODO confirm; mean_squared_error
    # cannot score string class labels.
    mse = mean_squared_error(train_labels, train_predictions)
    print("TRAINING MSE =", mse)

    # ---- Predict on the test set ---------------------------------------
    print("Fitting test data")
    test_data_features = vectorizer.transform(test_reviews).toarray()
    print(test_data_features.shape)

    print("Making predictions on test data")
    test_predictions = forest.predict(test_data_features)
    print("done.")

    # ---- Write predictions -----------------------------------------------
    print("Writting predictions to file")
    output = pd.DataFrame(data={"id": test['id'], "train": test_predictions})
    file_name = "./ModelResponses/%s_forest%d_bow%d_%spredictions.csv"\
                % (dataset, forest_size, vocab_limit, ("extended_" if extend_data else ""))
    # Create the output directory on first use so to_csv does not fail.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # quoting=3 is csv.QUOTE_NONE: fields are written without quote characters.
    output.to_csv(file_name, index=False, quoting=3)
    print("done.")


In [None]:
if __name__ == '__main__':
    # Entry point: read the run settings from the command line (falling back
    # to the defaults below) and run the bag-of-words random-forest pipeline.
    parser = argparse.ArgumentParser()
    # Custom 'bool' type so that "--extend yes|true|t|1" parses as True and
    # any other value as False.
    parser.register('type', 'bool', lambda v: v.lower() in ("yes", "true", "t", "1"))
    parser.add_argument(
        # add_argument needs a name or flag as its first argument; the
        # dataset identifier is exposed as an optional --dataset flag.
        '--dataset',
        type=str,
        default='IMDB-Dataset.csv',
        help='Dataset to load and predict on.'
    )
    parser.add_argument(
        '--features_size',
        type=int,
        default=10000,
        help='Number of features per review = vocab size.'
    )
    parser.add_argument(
        '--forest_size',
        type=int,
        default=100,
        help='Number of classification trees in Random Forest algorithm.'
    )
    parser.add_argument(
        '--extend',
        type='bool',
        default=False,
        # Double-quoted so the single quotes around the file name are legal.
        help="Flag to decide to load the extended data ('IMDB-Dataset.csv') or not."
    )
    # parse_known_args ignores arguments it does not recognise. Inside a
    # Jupyter kernel sys.argv typically carries the kernel's own
    # "-f <connection file>" option, which parse_args() would reject.
    args, _unrecognized = parser.parse_known_args()
    print('args:', args)

    print("Running RandomForest on BagOfWords features")
    random_forest_on_bow(args.dataset, extend_data=args.extend, vocab_limit=args.features_size, forest_size=args.forest_size)