In [1]:
import os
import re
import json
import string

import pandas as pd
import numpy as np

from pprint import pprint

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

In [5]:
df = pd.read_csv('coleridgeinitiative/data.csv')
df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [6]:
df.count()

Id               19661
pub_title        19661
dataset_title    19661
dataset_label    19661
cleaned_label    19661
dtype: int64

In [7]:
df.nunique()

Id               14316
pub_title        14271
dataset_title       45
dataset_label      130
cleaned_label      130
dtype: int64

In [8]:
with open('coleridgeinitiative/data/0008656f-0ba2-4632-8602-3017b44c2e90.json', 'r') as file:
    pprint(json.load(file))

[{'section_title': 'Abstract',
  'text': "In this study, Finnish ninth graders' and their school guidance "
          "counselors' views concerning ninth graders' perceptions of "
          'gender-appropriateness of occupations were examined. Special '
          'interest was placed on evaluating if ninth graders bring out any '
          'gender stereotypical perceptions regarding science, technology, '
          'engineering or mathematics (STEM) occupations. The data were '
          'gathered with the aid of an online survey (246 pupils) and '
          'semi-structured interviews (7 school guidance counselors). Ninth '
          'graders referred mostly to masculine physical dimension when '
          'justifying certain occupations being more suitable for men than for '
          'women. Respectively, they referred mostly to gender-typical '
          'interest when justifying certain occupations to be more suitable '
          'for women than for men. Boys presented more gender

In [9]:
df.loc[df["Id"] == '0008656f-0ba2-4632-8602-3017b44c2e90']

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
7736,0008656f-0ba2-4632-8602-3017b44c2e90,Finnish Ninth Graders’ Gender Appropriateness ...,Trends in International Mathematics and Scienc...,Trends in International Mathematics and Scienc...,trends in international mathematics and scienc...


In [10]:
data_path = 'coleridgeinitiative/data'

In [11]:
def clean_text(txt):
     return re.sub('[^A-Za-z0-9.]+', ' ', str(txt).lower())

In [12]:
MAX_LENGTH = 64
OVERLAP = 20
    
def shorten_sentences(sentences):
    """
    Sentences that have more than MAX_LENGTH words will be split
    into multiple sentences with overlappings.
    """
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

In [13]:
def preprocess_text(text):
    # Clean and shorten the text, remove short sentences
    sentences = [clean_text(sentence) for sentence in nltk.sent_tokenize(text)]
    sentences = shorten_sentences(sentences)
    return [sentence for sentence in sentences if len(sentence) > 10]

In [14]:
def contains_label(sentence, labels):
    # Check if the sentence contains any of the labels
    return any(re.search(label.lower(), sentence) for label in labels)

In [15]:
data = []
label_count = 0
empty_count = 0

for _, row in df.iterrows():
    # Load the data
    if os.path.exists(f"{data_path}/{row.Id}.json"):
        with open(f"{data_path}/{row.Id}.json") as f:
            json_data = json.load(f)

        sentences = preprocess_text(str(json_data))
        labels = [row.cleaned_label.lower(), row.dataset_label.lower(), row.dataset_title.lower()]

        balanced = False
        for sentence in sentences:
            if contains_label(sentence, labels):
                data.append((sentence, labels[0])) # use cleaned_label as the standard label
                label_count += 1
                balanced = True
            elif balanced:
                empty_count += 1
                balanced = False

print('Text with dataset:', label_count)
print('Text without dataset:', empty_count)


Text with dataset: 380
Text without dataset: 328


In [16]:
data[:5]

[('using data from the national education longitudinal study of 1988 i find that as parents education increases children in single mother families experience a lower boost in their achievement test scores likelihood of attending any post secondary schooling likelihood of completing a 4 year college degree and years of completed schooling relative to children living with both biological parents.',
  'national education longitudinal study'),
 (' section title data and methods text data for this analysis derive from the national education longitudinal study of 1988 nels 88 a nationally representative two stage stratified cluster sample representative of 1988 u.s. 8th graders.',
  'national education longitudinal study'),
 ('other empirical results from the national education longitudinal study show that same gender teaching increases the test scores of eighth grade children dee 2005b .',
  'national education longitudinal study'),
 ('again the intention here is not to engage in a comparat

In [17]:
train_df = pd.DataFrame(data, columns=['Sentence', 'Label'])
train_df.head()

Unnamed: 0,Sentence,Label
0,using data from the national education longitu...,national education longitudinal study
1,section title data and methods text data for ...,national education longitudinal study
2,other empirical results from the national educ...,national education longitudinal study
3,again the intention here is not to engage in a...,national education longitudinal study
4,similarly ludwig and miller 2007 using data fr...,national education longitudinal study


In [18]:
train_df.nunique()

Sentence    334
Label        43
dtype: int64

In [19]:
train_df.tail()

Unnamed: 0,Sentence,Label
375,importantly the sars cov 2 genome sequence fro...,sars cov 2 genome sequence
376,nine complete sars cov 2 genome sequences from...,sars cov 2 genome sequences
377,for example 80 of confirmed covid 19 cases up ...,sars cov 2 full genome sequences
378,the whole genome sequence of covid 19 has abou...,genome sequence of covid 19
379,section title methods text we used the public...,covid 19 image data collection


In [20]:
import pickle

In [21]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

# get cross validation to make sure model has
# gotten most of the patterns from the data correctly, 
# and its not picking up too much on the noise
k_fold = KFold(n_splits = 5)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
X = train_df['Sentence'].to_numpy()
y = train_df['Label'].to_numpy()

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [25]:
summary = pd.DataFrame(
    {
        'Model': [],
        'Accuracy': [],
        'CV': [],
        'Precision': [],
        'Recall': [],
        'F1': []
    }
)

In [26]:
# https://developers.google.com/machine-learning/guides/text-classification/step-3

In [27]:
vectorizer = TfidfVectorizer(
    min_df = 1,
    dtype = 'int32',
    strip_accents = 'unicode',
    analyzer = 'word',
    token_pattern = r'\w{1,}',
    ngram_range = (1, 4),
    use_idf = 1,
    smooth_idf = 1,
    sublinear_tf = 1,
    stop_words = stopwords_list
)

# Learn vocabulary from training texts and vectorize training texts
vectorizer = vectorizer.fit(list(X_train) + list(X_test))

# Transform Bag-of-Words textual data to numeric
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)



In [28]:
vectorizer.get_feature_names_out()[:20]

array(['0', '0 00', '0 00 1', '0 00 1 00', '0 01', '0 01 degree',
       '0 01 degree uniform', '0 01 set', '0 01 set zero', '0 01 x0',
       '0 01 x0 01', '0 18', '0 18 0', '0 18 0 global', '0 22',
       '0 22 high', '0 22 high tide', '0 30', '0 30 30', '0 30 30 global'],
      dtype=object)

In [29]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import preprocessing

In [30]:
# select top 'k' of the vectorized features, limit on the number of features. We use the top 20K features
top_k = 20

selector = SelectKBest(f_classif, k = min(top_k, X_train.shape[1]))
selector.fit(X_train, y_train)
X_train = selector.transform(X_train).astype('float32')
X_test = selector.transform(X_test).astype('float32')

  f = msb / msw
  f = msb / msw


In [31]:
# use the LabelEncoder to convert text labels to integers, 0, 1, 2, etc.
encoder = preprocessing.LabelEncoder()

Since we have two different data set (`X_train` and `X_test`), we need to fit it on all of our data otherwise there might be some categories in the test set `X_test` that were not in the train set `X_train` and we will get errors.

In [32]:
encoder.fit(list(y_train) + list(y_test)) 
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [33]:
encoder.classes_

array(['adni', 'agricultural resource management survey',
       'agricultural resources management survey',
       'alzheimer s disease neuroimaging initiative adni ',
       'alzheimers disease neuroimaging initiative',
       'baccalaureate and beyond',
       'baltimore longitudinal study of aging',
       'baltimore longitudinal study of aging blsa ',
       'beginning postsecondary students', 'census of agriculture',
       'coastal change analysis program', 'common core of data',
       'covid 19 death data', 'covid 19 image data collection',
       'covid 19 open research dataset',
       'early childhood longitudinal study',
       'education longitudinal study', 'genome sequence of covid 19',
       'ibtracs',
       'international best track archive for climate stewardship',
       'national assessment of education progress',
       'national center for education statistics common core of data',
       'national education longitudinal study',
       'national science foundat

In [34]:
print(X_train)
X_train.shape

  (19, 13)	0.1026464
  (19, 14)	0.10976793
  (19, 15)	0.10976793
  (19, 16)	0.10976793
  (33, 0)	0.15661183
  (33, 1)	0.15661183
  (33, 2)	0.15661183
  (58, 3)	0.16028745
  (186, 4)	0.13137828
  (186, 5)	0.13137828
  (186, 6)	0.13137828
  (186, 7)	0.13137828
  (186, 17)	0.13137828
  (186, 18)	0.13137828
  (186, 19)	0.13137828
  (261, 12)	0.09072764
  (273, 8)	0.1919229
  (273, 9)	0.1919229
  (273, 10)	0.1919229
  (273, 11)	0.1919229


(304, 20)

In [35]:
print(y_train)
y_train.shape

[ 7 39 37  0  0  3  0  0  2  0  1 22  0  0  0 15  9  0  0  5 37  0  0 15
 32  6  9  0  0 42  0  3  0 24 37  0  0  4 16 38 39  6  0 32  0 29 26  1
  0  0 39 27 15  0  0  0  6 37  8  0  0 37 42 33 38  9 22 16  2  0  0  9
  0  0 35 41 37  0  0  0  0  0 34  0  0 38  0 37  1  3 42  0  0 41  7 18
  0 26  0  0  0  0  3  0  3  0  0  0 15 15 25  6  0 40 32  7  0  6  0  0
  3  0  0  0 32  0  0  0  0  0 37 37 32 41  0 32 22  3  0  3  1 25  0  0
 25  0 16  0  0  0  0 40 37  0  0  0  9  0  0  0  1  0  0  0 11 19  0  0
  0  0  0 18 16 20 34 37 27  0  0  0  3  0  3  3  3 20 30  0  9 15 37 37
  0  0  7  0  0  3  0  0  0 32  3  0  3 39  0  0  0  0 36  0 32  0  6  0
  0 36  0  3  3  0  3  3 10  0  0  0  0 22  0 37 41  0 22 39  3  1 37  6
  0  0 15  0  3  3  3  1  0  0 32  0  9  3  0  0 42 25 19  0 39 12  0  0
 37  0  0  0  0  0  0 13  0 17 15 15 22 37 37  0  0 28  3 18 22  0  0 15
  0  0  0  0  0  0  0  0  0  0  0 31 41  0 39  9]


(304,)

## Random Forest Classifier

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
# create a tokenization a modelling pipeline
RandomForestClassifier_model = RandomForestClassifier(
    n_estimators = 100,
    random_state = 123
)

In [38]:
# fit the pipeline to the training data
RandomForestClassifier_model.fit(X_train, y_train)

In [39]:
RandomForestClassifier_yhat_train = RandomForestClassifier_model.predict(X_train)
RandomForestClassifier_yhat_test = RandomForestClassifier_model.predict(X_test)

In [40]:
# train accuracy 
RandomForestClassifier_train_accuracy = accuracy_score(y_train, RandomForestClassifier_yhat_train) * 100
print('Train accuracy: {0}'.format(RandomForestClassifier_train_accuracy))

# test accuracy 
RandomForestClassifier_test_accuracy = accuracy_score(y_test, RandomForestClassifier_yhat_test) * 100
print('Test accuracy: {0}'.format(RandomForestClassifier_test_accuracy))

Train accuracy: 50.6578947368421
Test accuracy: 48.68421052631579


In [41]:
pickle.dump(RandomForestClassifier_model, open('RandomForestClassifier_model.sav', 'wb'))

In [42]:
RandomForestClassifier_CV = cross_val_score(
    RandomForestClassifier_model, 
    X_train, y_train, 
    cv = k_fold, 
    scoring = 'accuracy', 
    n_jobs = -1
).mean()

RandomForestClassifier_CV

0.4869398907103825

In [43]:
print('Model: RandomForestClassifier', '\n', classification_report(y_test, RandomForestClassifier_yhat_test))

Model: RandomForestClassifier 
               precision    recall  f1-score   support

           0       0.49      1.00      0.66        37
           1       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         5
          16       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         2
          21       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         0
          25       0.00      0.00      0.00         1
          29       0.00      0.00      0.00         1
          32       0.00      0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
summary.loc[0] = [
    'Random Forest Classifier',
    round(accuracy_score(y_test, RandomForestClassifier_yhat_test), 2),
    round(RandomForestClassifier_CV, 2),
    round(precision_score(y_test, RandomForestClassifier_yhat_test, average = 'macro'), 2), 
    round(recall_score(y_test, RandomForestClassifier_yhat_test, average = 'macro'), 2), 
    round(f1_score(y_test, RandomForestClassifier_yhat_test, average = 'macro'), 2)
]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
summary

Unnamed: 0,Model,Accuracy,CV,Precision,Recall,F1
0,Random Forest Classifier,0.49,0.49,0.02,0.05,0.03


## Support Vector Machine

Linear Support Vector Machine

In [46]:
from sklearn.linear_model import SGDClassifier

In [47]:
# create a tokenization a modelling pipeline
SGDClassifier_model = SGDClassifier(
    loss = 'hinge',
    penalty = 'l2',
    alpha = 1e-3,
    random_state = 123,
    max_iter = 5
)

# fit the pipeline to the training data
SGDClassifier_model.fit(X_train, y_train)



In [48]:
SGDClassifier_yhat_train = SGDClassifier_model.predict(X_train)
SGDClassifier_yhat_test = SGDClassifier_model.predict(X_test)

In [49]:
# train accuracy 
SGDClassifier_train_accuracy = accuracy_score(y_train, SGDClassifier_yhat_train) * 100
print('Train accuracy: {0}'.format(SGDClassifier_train_accuracy))

# test accuracy 
SGDClassifier_test_accuracy = accuracy_score(y_test, SGDClassifier_yhat_test) * 100
print('Test accuracy: {0}'.format(SGDClassifier_test_accuracy))

Train accuracy: 49.67105263157895
Test accuracy: 48.68421052631579


In [50]:
pickle.dump(SGDClassifier_model, open('SGDClassifier_model.sav', 'wb'))

In [51]:
SGDClassifier_CV = cross_val_score(
    SGDClassifier_model, 
    X_train, y_train,
    cv = k_fold,
    scoring = 'accuracy',
    n_jobs=-1
).mean()

SGDClassifier_CV



0.4869398907103825

In [52]:
print('Model: Support Vector Machine', '\n', classification_report(y_test, SGDClassifier_yhat_test))

Model: Support Vector Machine 
               precision    recall  f1-score   support

           0       0.49      1.00      0.66        37
           1       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         5
          16       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         2
          21       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         0
          25       0.00      0.00      0.00         1
          29       0.00      0.00      0.00         1
          32       0.00      0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
summary.loc[1] = [
    'Support Vector Machine',
    round(accuracy_score(y_test, SGDClassifier_yhat_test), 2),
    round(SGDClassifier_CV, 2),
    round(precision_score(y_test, SGDClassifier_yhat_test, average = 'macro'), 2), 
    round(recall_score(y_test, SGDClassifier_yhat_test, average = 'macro'), 2), 
    round(f1_score(y_test, SGDClassifier_yhat_test, average = 'macro'), 2)
]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
summary

Unnamed: 0,Model,Accuracy,CV,Precision,Recall,F1
0,Random Forest Classifier,0.49,0.49,0.02,0.05,0.03
1,Support Vector Machine,0.49,0.49,0.02,0.05,0.03


## Naive Bayes

MultinomialNB

In [55]:
from sklearn.naive_bayes import MultinomialNB

In [56]:
# create a tokenization a modelling pipeline
MultinomialNB_model = MultinomialNB() #model the text

# fit the pipeline to the training data
MultinomialNB_model.fit(X_train, y_train)

In [57]:
MultinomialNB_yhat_train = MultinomialNB_model.predict(X_train)
MultinomialNB_yhat_test = MultinomialNB_model.predict(X_test)

In [58]:
# train accuracy 
MultinomialNB_train_accuracy = accuracy_score(y_train, MultinomialNB_yhat_train) * 100
print('Train accuracy: {0}'.format(MultinomialNB_train_accuracy))

# test accuracy 
MultinomialNB_test_accuracy = accuracy_score(y_test, MultinomialNB_yhat_test) * 100
print('Test accuracy: {0}'.format(MultinomialNB_test_accuracy))

Train accuracy: 48.68421052631579
Test accuracy: 48.68421052631579


In [59]:
pickle.dump(MultinomialNB_model, open('MultinomialNB_model.sav', 'wb'))

In [60]:
MultinomialNB_CV = cross_val_score(
    MultinomialNB_model, 
    X_train, y_train,
    cv = k_fold, 
    scoring = 'accuracy', 
    n_jobs=-1
).mean()

MultinomialNB_CV

0.4869398907103825

In [61]:
print('Model: MultinomialNB', '\n', classification_report(y_test, MultinomialNB_yhat_test))

Model: MultinomialNB 
               precision    recall  f1-score   support

           0       0.49      1.00      0.65        37
           1       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         5
          16       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         2
          21       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         1
          29       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
summary.loc[2] = [
    'Naive Bayes',
    round(accuracy_score(y_test, MultinomialNB_yhat_test), 2),
    round(MultinomialNB_CV, 2),
    round(precision_score(y_test, MultinomialNB_yhat_test, average = 'macro'), 2), 
    round(recall_score(y_test, MultinomialNB_yhat_test, average = 'macro'), 2), 
    round(f1_score(y_test, MultinomialNB_yhat_test, average = 'macro'), 2)
]

  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
summary

Unnamed: 0,Model,Accuracy,CV,Precision,Recall,F1
0,Random Forest Classifier,0.49,0.49,0.02,0.05,0.03
1,Support Vector Machine,0.49,0.49,0.02,0.05,0.03
2,Naive Bayes,0.49,0.49,0.02,0.05,0.03


In [76]:
text = 'We use data from the Early Childhood Longitudinal Study – Birth Cohort (ECLS-B). This dataset follows a nationally representative sample of U.S. born children from birth in 2001 until the start of kindergarten, up to 2007. We use information from Waves 1, 2, 3, and 4, when children are approximately 9 months, 2 years, 4½ years, and 5½ years old, respectively. Of the ~7000 children that participated in Wave 4, we analyze the ~3800 that are children of U.S.-born non-Hispanic White mothers, U.S.-born Mexican heritage mothers, and foreign-born Mexican mothers.'

In [77]:
text

'We use data from the Early Childhood Longitudinal Study – Birth Cohort (ECLS-B). This dataset follows a nationally representative sample of U.S. born children from birth in 2001 until the start of kindergarten, up to 2007. We use information from Waves 1, 2, 3, and 4, when children are approximately 9 months, 2 years, 4½ years, and 5½ years old, respectively. Of the ~7000 children that participated in Wave 4, we analyze the ~3800 that are children of U.S.-born non-Hispanic White mothers, U.S.-born Mexican heritage mothers, and foreign-born Mexican mothers.'

In [78]:
# extract sentences
sentences = [clean_text(sentence) for sentence in nltk.sent_tokenize(str(text))]
sentences = shorten_sentences(sentences) # make sentences short
sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

# predict
RandomForestClassifier_labels = RandomForestClassifier_model.predict(selector.transform(vectorizer.transform(sentences).toarray()))

# get labels
RandomForestClassifier_labels = encoder.inverse_transform(RandomForestClassifier_labels) 
print('RandomForestClassifier_label:', set(RandomForestClassifier_labels))
RandomForestClassifier_labels = set(RandomForestClassifier_labels)

RandomForestClassifier_labels

RandomForestClassifier_label: {'adni'}


{'adni'}