In [29]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [None]:
df = pd.read_csv("YouTube Titles and description using youtube api.csv")

In [3]:
df.isnull().sum()

Video Id        0
Title           0
Description    40
Category        0
dtype: int64

In [4]:
df = df.dropna(how="any")

In [5]:
df.isnull().sum()

Video Id       0
Title          0
Description    0
Category       0
dtype: int64

In [6]:
df.shape

(2566, 4)

# Preprocess

In [7]:
stop_words_list = stopwords.words("english")

In [8]:
def pre_process(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\d", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = text.split()
    words = [word for word in words if not word in stop_words_list]
    words = [re.sub(r"(.)\1{1,}", r"\1\1", word) for word in words]
    words = [word.strip() for word in words if len(word.strip()) > 1]
    text = " ".join(words)
    return text

In [9]:
df.columns

Index(['Video Id', 'Title', 'Description', 'Category'], dtype='object')

In [10]:
df["Title"] = df["Title"].apply(pre_process)
df["Description"] = df["Description"].apply(pre_process)

In [None]:
le = LabelEncoder().fit(df["Category"])
le_nm = dict(zip(le.classes_, le.transform(le.classes_)))
df["Category"] = df["Category"].apply(lambda x: le_nm[x])

In [12]:
df["Category"].value_counts()

5    1346
3     248
4     245
0     244
1     243
2     240
Name: Category, dtype: int64

In [13]:
le_nm

{'food': 0,
 'history': 1,
 'manufacturing': 2,
 'music': 3,
 'science': 4,
 'travel': 5}

In [14]:
count_vectorizer = CountVectorizer(max_features=10000)
X = count_vectorizer.fit_transform(df["Description"], df["Title"]).toarray()
y = df["Category"].values

In [15]:
X.shape

(2566, 10000)

In [16]:
tfidf_transformer = TfidfTransformer().fit_transform(X)

# Model Training

In [17]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_transformer, y, test_size=0.2, random_state=4242)

In [18]:
rf_model = RandomForestClassifier(n_estimators=1000, criterion="entropy")
rf_model.fit(X_train, y_train)

In [19]:
y_pred = rf_model.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.8910505836575876


In [29]:
rf_params = {
    "n_estimators": [100, 250, 500],
    "max_depth": [1, 15, 30],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [2, 5],
    "max_features": ["sqrt"],
    "ccp_alpha": [1, 2, 2.5]
}

In [30]:
rf = RandomForestClassifier()
rf_cv_model = GridSearchCV(rf, rf_params, cv=10, n_jobs=-1).fit(X_train, y_train)
rf_cv_model.best_params_

{'ccp_alpha': 1,
 'max_depth': 1,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [31]:
rf_tuned = RandomForestClassifier(ccp_alpha=1, max_depth=1, max_features="sqrt", min_samples_leaf=2, min_samples_split=2, n_estimators=100)
rf_tuned.fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
print("Random Forest: ", accuracy_score(y_test, y_pred))

Random Forest:  0.5252918287937743


In [25]:
svc_model = SVC()
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.8482490272373541


In [27]:
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)
print("Adaboost: ", accuracy_score(y_test, y_pred))

Adaboost:  0.7529182879377432


In [28]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Logistic Regression: ", accuracy_score(y_test, y_pred))

Logistic Regression:  0.8424124513618677


In [30]:
bagging_model = BaggingClassifier()
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)
print("Bagging: ", accuracy_score(y_test, y_pred))

Bagging:  0.8929961089494164


In [31]:
gradient_model = GradientBoostingClassifier()
gradient_model.fit(X_train, y_train)
y_pred = gradient_model.predict(X_test)
print("Gradient Boosting: ", accuracy_score(y_test, y_pred))

Gradient Boosting:  0.9066147859922179


In [20]:
pickle.dump(rf_model, open("RandomForest.pkl", "wb"))

In [21]:
pickle.dump(tfidf_transformer, open("TFIDF.pkl", "wb"))