In [1]:
! pip install numpy pandas scikit-learn matplotlib



### Predict article categories with the tuned TF-IDF + SVD Random Forest

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.feature_extraction import text
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# Fixed seed shared by every stochastic step (the randomized SVD solver and the
# forest's sampling) so that Restart & Run All reproduces the same predictions.
SEED = 42

# Number of times the title is repeated in front of the article body.
TITLE_WEIGHT = 3


def build_documents(frame, title_weight=TITLE_WEIGHT):
    """Return one text document per row: the title repeated, then the content.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide 'title' and 'content' columns.
    title_weight : int
        How many times the title (followed by a space) is prepended.

    Returns
    -------
    pandas.Series
        One string per row of ``frame``, aligned on its index.
    """
    # A missing title or content would turn the whole concatenation into NaN,
    # which the vectorizer rejects as an invalid document; use '' instead.
    title = frame['title'].fillna('')
    content = frame['content'].fillna('')
    return (title + ' ') * title_weight + content


# --- Load data --------------------------------------------------------------
train_df = pd.read_csv('../assist_material/datasets/extracted/q1/train.csv', sep=',')
train_df.columns = ['id', 'title', 'content', 'label']

test_df = pd.read_csv('../assist_material/datasets/extracted/q1/test_without_labels.csv', sep=',')
test_df.columns = ['id', 'title', 'content']

# --- Build the text documents and the target --------------------------------
X_train = build_documents(train_df)
y_train = train_df['label']

X_test = build_documents(test_df)

# --- Pipeline: TF-IDF -> truncated SVD -> random forest ---------------------
# scikit-learn validates `stop_words` as 'english', a list or None, so the
# frozenset produced by `.union(...)` is converted to a (sorted) list.
stop_words = sorted(
    ENGLISH_STOP_WORDS.union(['will', 's', 't', 'one', 'new', 'said', 'say', 'says', 'year'])
)

vectorizer_tuned = TfidfVectorizer(lowercase=True, stop_words=stop_words, ngram_range=(1,1), max_features=50000)
svd_tuned = TruncatedSVD(n_components=1000, random_state=SEED)

random_forest_tuned = RandomForestClassifier(
    n_estimators=1000,
    max_features='sqrt',
    n_jobs=-1,  # use all available cores
    random_state=SEED,
)

random_forest_tfidf_svd_tuned = make_pipeline(vectorizer_tuned, svd_tuned, random_forest_tuned)

# --- Fit on the training set and predict the unlabeled test set -------------
random_forest_tfidf_svd_tuned.fit(X_train, y_train)
y_pred = random_forest_tfidf_svd_tuned.predict(X_test)


In [3]:
# Build the submission table first so that `prediction` holds the DataFrame:
# `DataFrame.to_csv` returns None when it is given a file path, so chaining
# the call onto the constructor would bind `prediction` to None.
prediction = pd.DataFrame(data={'Id': test_df['id'], 'Predicted': y_pred})
prediction.to_csv('testSet_categories.csv', index=False)