# Machine Learning: Text Classification Assignment

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [None]:
import nltk
# Fetch the NLTK data packages used later in the notebook: the punkt
# tokenizer models (word_tokenize), the stop-word lists, and WordNet
# (WordNetLemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Use the CategorizedPlaintextCorpusReader to import the AP_News corpus.

In [None]:
# Colab-only: mount Google Drive at /content/drive so the corpus files stored
# there can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root directory of the AP_News corpus on the mounted drive.
PATH = '/content/drive/MyDrive/Data/AP_News'

# Every .txt file under PATH is treated as one document.
DOC_PATTERN = r'.*\.txt'
# Capture group 1 is used as the document's category: the leading run of
# word/whitespace characters of the file id.
# Presumably file ids look like '<category>/<name>.txt' -- confirm against the
# directory layout.
CAT_PATTERN = r'([\w_\s]+).*'

corpus = CategorizedPlaintextCorpusReader(PATH, DOC_PATTERN, cat_pattern = CAT_PATTERN)

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [None]:
# Raw text of every file in the corpus, in corpus.fileids() order.
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
docs[0]  # display the first document as a sanity check

'HONOLULU (AP) — The University of Hawaii seeks additional funds for student mental health services, scholarships and other items in a new supplementary budget request, officials said. The Board of Regents approved the fiscal year 2020-2021 supplemental operating budget of about $28 million Thursday, The Honolulu Star-Advertiser reported. The request will be submitted to the state Legislature and Democratic Gov. The university requested $2.6 million to hire 19 psychologists for the 10-campus system. The University of Hawaii Manoa has eight psychologists, the Hilo campus has three, and the West Oahu campus has 1.75 positions, while each community college has one position, said Allyson Tanouye, who coordinates mental health throughout the university system. "The national standard is one mental health professional per 1,000 to 1,500 students," Tanouye said. "If we add the 19 positions we will be up to one per 1,500. That\'s how low we are". The mental health funding would also expand prog

In [None]:
# First category of each file, built in the same corpus.fileids() order as
# `docs`, so docs[i] and categories[i] describe the same article.
categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]

In [None]:
print(categories)

['health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'health', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'spor

### Preprocess the corpus, ensuring to include the following steps.

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [None]:
def preprocess(docs):
  """Normalize raw documents into space-joined strings of stemmed tokens.

  Each document is word-tokenized. Tokens that are not purely alphabetic, or
  whose lowercase form is an English stop word, are dropped. The remaining
  tokens are lowercased, lemmatized with WordNet and then stemmed with the
  English Snowball stemmer.

  Args:
    docs: Iterable of raw document strings.

  Returns:
    A list holding one preprocessed string per input document, in input order.
  """
  lemmatizer = WordNetLemmatizer()
  stemmer = SnowballStemmer('english')
  # Load the stop words once and keep them in a set. Calling
  # stopwords.words() inside the token filter re-reads the word list for
  # every single token and then scans it linearly.
  stop_words = frozenset(stopwords.words('english'))
  preprocessed = []

  for doc in docs:
    kept = []
    for token in word_tokenize(doc):
      lowered = token.lower()
      # isalpha() removes punctuation and numbers; the set lookup removes
      # stop words.
      if token.isalpha() and lowered not in stop_words:
        kept.append(stemmer.stem(lemmatizer.lemmatize(lowered)))

    # Re-join so the result can be fed straight to a text vectorizer.
    preprocessed.append(' '.join(kept))

  return preprocessed

In [None]:
preprocessed = preprocess(docs)

### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [None]:
# train_test_split is already imported as `tts` in the first cell; this cell
# imports it again under its full name.
from sklearn.model_selection import train_test_split

# Hold out 30% of the documents for testing; random_state pins the shuffle so
# the split is the same on every run.
# NOTE(review): the split is not stratified, so class proportions in the test
# set can drift from the corpus -- consider stratify=categories; confirm.
X_train, X_test, y_train, y_test = train_test_split(preprocessed, categories, test_size=0.3, random_state=42)

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [None]:
# TfidfTransformer re-weights a matrix of term counts, so the CountVectorizer
# step is what turns the preprocessed strings into that matrix.
# NOTE: the step name 'tdidf' (sic) is kept as-is because step names are part
# of the pipeline's parameter namespace.
# random_state is fixed (same seed as the train/test split) so the forest, and
# therefore every score computed from it, is reproducible between runs.
model = Pipeline([('vect', CountVectorizer()),
                  ('tdidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
                  ])



In [None]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [None]:
predictions = model.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      health       0.79      0.69      0.73        16
    politics       0.90      0.90      0.90        10
      sports       1.00      0.80      0.89        15
        tech       0.65      0.87      0.74        15

    accuracy                           0.80        56
   macro avg       0.83      0.81      0.82        56
weighted avg       0.83      0.80      0.81        56



### Perform 10-fold cross validation and obtain the average F1 score across all the folds.

In [None]:
# Score the whole pipeline with 10-fold cross-validation using macro-averaged
# F1. Because the pipeline itself is passed in, the vectorizer and TF-IDF
# weights are refitted on the training part of each fold.
f1_scores = cross_val_score(model, preprocessed, categories, cv=10, scoring='f1_macro')

f1_scores

array([0.8540404 , 0.83888889, 0.68903319, 1.        , 0.84155844,
       0.66043956, 0.84848485, 0.70064935, 0.63992674, 0.71016484])

In [None]:
f1_scores.mean()

0.7783186258186259

In [None]:
f1_scores.std()

0.10916266855733489

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [None]:
url = 'https://www.nytimes.com/2019/11/25/business/uber-london.html'

In [None]:
import requests
from bs4 import BeautifulSoup

def get_url_text(url, timeout=10):
  """Download a web page and return the text of its headings, paragraphs and list items.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The text of every matching tag joined by single spaces, with newlines
    replaced by spaces and leading/trailing whitespace removed.

  Raises:
    requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout: If the server does not answer within `timeout`.
  """
  # Without a timeout requests can block indefinitely on a stalled server.
  response = requests.get(url, timeout=timeout)
  # Fail loudly instead of extracting text from an error page.
  response.raise_for_status()
  content = response.text

  TAGS = ['h1','h2','h3','h4','h5','h6','h7','p','li']
  soup = BeautifulSoup(content, 'lxml')
  text_list = [tag.get_text() for tag in soup.find_all(TAGS)]
  text = ' '.join(text_list)
  # Replace newlines with a space (not the empty string) so words that were
  # separated only by a line break are not fused into one token.
  return text.replace('\n', ' ').strip()

In [None]:
# Fetch the article, then run it through the same preprocessing as the
# corpus documents. preprocess() takes a collection of documents, hence the
# one-element list.
raw_text = get_url_text(url)
cleaned = preprocess([raw_text])
cleaned

['advertis support uber fight surviv london lose licens compani odd regul driver tradit cab lucrat european market adam satariano ami tsang london uber suffer major blow monday london transport author made surpris decis extend taxi oper licens persist safeti problem throw question whether compani continu oper lucrat european market decis immedi affect uber presenc london street compani said would appeal decis set could long legal process uber continu oper throughout time news add difficult year compani stage disappoint initi public offer may uber sinc experienc execut turnov sever round layoff face continu public scrutini safeti passeng transport london regul taxi privat hire servic citi said uber meet fit proper standard need hold taxi licens regul said uber pattern failur place passeng safeti risk includ vulner uber app allow unauthor driver carri thousand rider clear concern issu aros also concern confid similar issu happen helen chapman director licens regul charg transport london 

In [None]:
model.predict(cleaned)[0]

'tech'