# Modeling


1. Find the best Modal (tuning)

2. Exploratary Data Analysis

3. Deployment


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pylab as plt
import pickle
import joblib
import re
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from xgboost import XGBClassifier

plt.style.use('ggplot')
pd.options.display.max_rows = 4000

In [43]:
df = pd.read_csv(r'C:\Users\LEGION\Desktop\MMU\Data Science Fundamental\Project\Prediction of Video\dataset\merged.csv')

In [44]:
df.columns

Index(['Unnamed: 0', 'videoID', 'title', 'publishedAt', 'channelId',
       'channelTitle', 'categoryId', 'tags', 'views', 'likes', 'comments',
       'descriptionLength', 'thumbnailLink', 'dayOfWeek', 'daytime',
       'duration', 'titleLength', 'questionMark', 'exclamationMark',
       'sentimentScore', 'fullCapSentence', 'fullCapCount', 'trendingOrNot'],
      dtype='object')

In [45]:
df = df.drop(['Unnamed: 0','videoID','publishedAt','channelId','channelTitle','thumbnailLink','views','likes','comments','tags'],axis=1)
df['categoryId'] = df['categoryId'].astype('category')
df['dayOfWeek'] = df['dayOfWeek'].astype('category')
df['daytime'] = df['daytime'].astype('category')

In [46]:
stop_words = set(stopwords.words('english'))  # set of English stop words
lemmatizer = WordNetLemmatizer()
def preprocess(text,target_language='en'):
    if not isinstance(text, str):
        try:
            text = str(text)
        except:
            raise TypeError('Input must be a string or a float')     
    # convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text) 
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Removing repeated characters
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(w) for w in words]
    words = [w for w in words if not w in stop_words]
    return words

# Applying preprocessing function to title column
df['cleanTitle'] = df['title'].apply(preprocess)
df['cleanTitle'] = df['cleanTitle'].apply(lambda x: ' '.join(x))


In [47]:
df.columns

Index(['title', 'categoryId', 'descriptionLength', 'dayOfWeek', 'daytime',
       'duration', 'titleLength', 'questionMark', 'exclamationMark',
       'sentimentScore', 'fullCapSentence', 'fullCapCount', 'trendingOrNot',
       'cleanTitle'],
      dtype='object')

# Feature Selection

In [31]:
# Define a list of input features to test
# Define the feature indices
feature_indices = [
    [0, 2, 4],                  # ['cleanTitle', 'categoryId', 'duration']
    [0, 2, 4, 5, 6, 7, 8, 9],   # ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount']
    [0, 2, 4, 10, 11],          # ['cleanTitle', 'categoryId', 'duration', 'dayOfWeek', 'daytime']
    [0, 2, 4, 5, 6, 7, 8, 9, 10, 11],   # ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
    [0, 2, 4, 1],               # ['cleanTitle', 'categoryId', 'duration', 'titleLength']
    [0, 2, 4, 3],               # ['cleanTitle', 'categoryId', 'duration', 'descriptionLength']
    [0, 2, 4, 3, 1],             # ['cleanTitle', 'categoryId', 'duration', 'descriptionLength', 'titleLength']
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]   # ['cleanTitle', 'titleLength', 'categoryId', 'descriptionLength', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
]
# Define the dictionary mapping indices to feature names
feature_mapping = {
    0: 'cleanTitle',
    1: 'titleLength',
    2: 'categoryId',
    3: 'descriptionLength',
    4: 'duration',
    5: 'sentimentScore',
    6: 'questionMark', 
    7: 'exclamationMark',
    8: 'fullCapSentence',
    9: 'fullCapCount',
    10: 'dayOfWeek',
    11: 'daytime'
}

# Generate the feature sets using the indices
feature_sets = [[feature_mapping[i] for i in indices] for indices in feature_indices]

# Define the target variable
target_variable = 'trendingOrNot'


models = [
    ('Logistic Regression', LogisticRegression()),
    ('XGBoost', XGBClassifier())
    # ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42))
]

for features in feature_sets:
    X = df[features]
    y = df[target_variable]
    
   # Define the column transformer based on the feature set
    column_transformer = ColumnTransformer(transformers=[
        ('text', TfidfVectorizer(min_df=1, stop_words='english'), 'cleanTitle')
    ])
    
    if any(feature in ['titleLength', 'duration', 'descriptionLength','sentimentScore'] for feature in features):
        numeric_features = [feat for feat in features if feat in ['titleLength', 'duration', 'descriptionLength','sentimentScore']]
        column_transformer.transformers.append(('numeric', StandardScaler(), numeric_features))
        
    # if 'categoryId' in features:
    #     column_transformer.transformers.append(('categorical', TargetEncoder(), ['categoryId']))

    if any(feature in ['categoryId', 'dayOfWeek', 'daytime'] for feature in features):
        categorical_features = [feat for feat in features if feat in ['categoryId', 'dayOfWeek', 'daytime']]
        column_transformer.transformers.append(('categorical', TargetEncoder(), categorical_features))

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Oversample the training data
    ros = RandomOverSampler(random_state=0)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    # Perform cross-validation and select the best model
    best_model = None
    best_score = 0
    
    for name, model in models:
        pipeline = Pipeline([
            ('preprocessor', column_transformer),
            ('classifier', model)
        ])
        
        # Perform 5-fold cross-validation
        scores = cross_val_score(pipeline, X_train_resampled, y_train_resampled, cv=5)
        
        # Calculate the mean accuracy score
        mean_score = scores.mean()
        
        # Print the results for the current model and feature set
        print(f"Features: {features}")
        print(f"{name} Mean Accuracy: {mean_score}")
        
        # Update the best model if necessary
        if mean_score > best_score:
            best_model = model
            best_score = mean_score
    
    # Fit the best model on the resampled training data
    best_pipeline = Pipeline([
        ('preprocessor', column_transformer),
        ('classifier', best_model)
    ])
    best_pipeline.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions on the test set and evaluate the model
    y_pred = best_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("---------------------")
    print("Selected Features:", features)
    print("Best Model Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))
    print("---------------------")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration']
Logistic Regression Mean Accuracy: 0.8222124782273056
Features: ['cleanTitle', 'categoryId', 'duration']
XGBoost Mean Accuracy: 0.8066133733785886


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration']
Best Model Accuracy: 0.800750794109154
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.77      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.81     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount']
Logistic Regression Mean Accuracy: 0.8187636378837361
Features: ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount']
XGBoost Mean Accuracy: 0.8061155196603806


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount']
Best Model Accuracy: 0.7986881729301597
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.78      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.80     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration', 'dayOfWeek', 'daytime']
Logistic Regression Mean Accuracy: 0.8215991895994783
Features: ['cleanTitle', 'categoryId', 'duration', 'dayOfWeek', 'daytime']
XGBoost Mean Accuracy: 0.8068730941165011


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration', 'dayOfWeek', 'daytime']
Best Model Accuracy: 0.7993482117074379
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.78      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.81     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
Logistic Regression Mean Accuracy: 0.8187203816782805
Features: ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
XGBoost Mean Accuracy: 0.8075801872086009


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
Best Model Accuracy: 0.7999669980611361
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.77      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.81     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration', 'titleLength']
Logistic Regression Mean Accuracy: 0.8219310801916595
Features: ['cleanTitle', 'categoryId', 'duration', 'titleLength']
XGBoost Mean Accuracy: 0.8093839819603736


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration', 'titleLength']
Best Model Accuracy: 0.7983581535415205
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.78      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.80     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration', 'descriptionLength']
Logistic Regression Mean Accuracy: 0.8177463312368973
Features: ['cleanTitle', 'categoryId', 'duration', 'descriptionLength']
XGBoost Mean Accuracy: 0.8117288842931905


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration', 'descriptionLength']
Best Model Accuracy: 0.7986469205065798
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.77      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.80     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'categoryId', 'duration', 'descriptionLength', 'titleLength']
Logistic Regression Mean Accuracy: 0.8187997220299973
Features: ['cleanTitle', 'categoryId', 'duration', 'descriptionLength', 'titleLength']
XGBoost Mean Accuracy: 0.8130131750992893


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'categoryId', 'duration', 'descriptionLength', 'titleLength']
Best Model Accuracy: 0.7963367847861061
              precision    recall  f1-score   support

           0       0.90      0.80      0.85     17289
           1       0.61      0.78      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.80     24241

---------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Features: ['cleanTitle', 'titleLength', 'categoryId', 'descriptionLength', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
Logistic Regression Mean Accuracy: 0.8174937901066188
Features: ['cleanTitle', 'titleLength', 'categoryId', 'descriptionLength', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
XGBoost Mean Accuracy: 0.8164331257408209


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---------------------
Selected Features: ['cleanTitle', 'titleLength', 'categoryId', 'descriptionLength', 'duration', 'sentimentScore', 'questionMark', 'exclamationMark', 'fullCapSentence', 'fullCapCount', 'dayOfWeek', 'daytime']
Best Model Accuracy: 0.7974093477991832
              precision    recall  f1-score   support

           0       0.90      0.81      0.85     17289
           1       0.62      0.78      0.69      6952

    accuracy                           0.80     24241
   macro avg       0.76      0.79      0.77     24241
weighted avg       0.82      0.80      0.80     24241

---------------------


In [33]:
# Define the column transformer
column_transformer = ColumnTransformer([
    ('text', TfidfVectorizer(min_df=1, stop_words='english'), 'cleanTitle'),
    ('numeric', StandardScaler(), ['duration']),
    ('categorical', TargetEncoder(), ['categoryId'])
])

# Split the data into training and testing sets
X = df[['cleanTitle', 'categoryId','duration']]
y = df['trendingOrNot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data
ros = RandomOverSampler(random_state=0)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define the pipeline with preprocessing and modeling steps
pipelines = {
    'Logistic Regression': Pipeline([
        ('preprocessor', column_transformer),
        ('classifier', LogisticRegression(random_state=42))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', column_transformer),
        ('classifier', XGBClassifier(random_state=42))
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', column_transformer),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
}

# Define the parameter grids for each model
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2']
    },
    'XGBoost': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    },
    'Random Forest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10]
    }
}

# Perform hyperparameter tuning and model comparison
results = {}
for model_name, pipeline in pipelines.items():
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train_resampled, y_train_resampled)
    
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_
    
    results[model_name] = {
        'best_score': best_score,
        'best_params': best_params
    }

# Print the results
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best Score: {result['best_score']}")
    print(f"Best Parameters: {result['best_params']}")
    print()

# Fit the best model on the resampled training data
best_model_name = max(results, key=lambda x: results[x]['best_score'])
best_model = pipelines[best_model_name]
best_params = results[best_model_name]['best_params']
best_model.set_params(**best_params)
best_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set and evaluate the best model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model: Logistic Regression
Best Score: 0.8274651276386548
Best Parameters: {'classifier__C': 10.0, 'classifier__penalty': 'l2'}

Model: XGBoost
Best Score: 0.8232875059730287
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__n_estimators': 300}

Model: Random Forest
Best Score: 0.9277840773414207
Best Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}

Accuracy: 0.84893362485046
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     17289
           1       0.73      0.75      0.74      6952

    accuracy                           0.85     24241
   macro avg       0.81      0.82      0.82     24241
weighted avg       0.85      0.85      0.85     24241



In [185]:
from category_encoders import TargetEncoder, CountEncoder, CatBoostEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Define the column transformer
column_transformer = ColumnTransformer([
    ('text', TfidfVectorizer(min_df=1, stop_words='english'), 'cleanTitle'),
    ('numeric', StandardScaler(), ['duration']),
    ('categorical', TargetEncoder(), ['categoryId'])
])

# Split the data into training and testing sets
X = df[['cleanTitle', 'categoryId', 'duration']]
y = df['trendingOrNot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data
ros = RandomOverSampler(random_state=0)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define the pipeline with preprocessing and modeling steps
pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42))
])

# Define the parameter grid for encoders and scalers
param_grid = {
    'preprocessor__categorical': [TargetEncoder(), CountEncoder(), CatBoostEncoder()],
    'preprocessor__numeric': [StandardScaler(), MinMaxScaler()]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best model and its evaluation on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best Model Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# Get the best encoder and scaler names
best_encoder_name = grid_search.best_params_['preprocessor__categorical'].__class__.__name__
best_scaler_name = grid_search.best_params_['preprocessor__numeric'].__class__.__name__

print("Best Encoder:", best_encoder_name)
print("Best Scaler:", best_scaler_name)

Best Model Accuracy: 0.84893362485046
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     17289
           1       0.73      0.75      0.74      6952

    accuracy                           0.85     24241
   macro avg       0.81      0.82      0.82     24241
weighted avg       0.85      0.85      0.85     24241

Best Encoder: TargetEncoder
Best Scaler: StandardScaler


In [None]:
# Define the column transformer
column_transformer = ColumnTransformer([
    ('text', TfidfVectorizer(min_df=1, stop_words='english'), 'cleanTitle'),
    ('numeric', StandardScaler(), ['duration']),
    ('categorical',TargetEncoder(),['categoryId'])
])

# Split the data into training and testing sets
X = df[['cleanTitle', 'categoryId','duration']]
y = df['trendingOrNot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data
ros = RandomOverSampler(random_state=0)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Define the pipeline with preprocessing and modeling steps
pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42))
])

# Fit the pipeline on the resampled training data
pipeline.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set and evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [35]:
# Save the trained model
joblib.dump(pipeline, '85pct(new).pkl')


['85pct(new).pkl']

# Deployment

In [7]:
import re
from urllib.parse import urlparse, parse_qs

def get_video_id(url):
    video_id = None
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)

    if parsed_url.netloc == 'youtu.be':
        video_id = parsed_url.path[1:]
    elif parsed_url.netloc in ('www.youtube.com', 'youtube.com'):
        if 'v' in query_params:
            video_id = query_params['v'][0]
    return video_id

In [133]:
from googleapiclient.discovery import build
import isodate

# Set up the YouTube Data API client
api_keys = ['AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE',
'AIzaSyC7KzwigUsNJ4KNvqGfPqXVK9QcDBsKU78',
'AIzaSyDEPBCb1PhEaYHuBgzW6D5-ldTHUCowuq4',
'AIzaSyD-LN8Z7xG8OHtMQ89GRDvIaRQwkVHzfEo',
'AIzaSyCW5J_uI37UPmq3mJVAhVdWNdGSMAMg5tI',
'AIzaSyC8VVO0DhDY91lfyqqaUW85VKriqBiahBA',
'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y',
'AIzaSyA-DwJmtgWFO-I-Dwv1hcISJKXGDjbpZok',
'AIzaSyDC744JL3Xa3eORSxORoxKpunKFPPMGb3Y',
'AIzaSyD74KqDih_2AyOIJV-HaIvU9DdUOIyRONs',
'AIzaSyALgq5vR27iGsuFuLiz-Ry4NGy6E-L1PUY',
'AIzaSyC4hp-RHBw5uY4NcthYw-A2fqYyrG22kaE']
current_key_index = 0

def get_next_api_key():
    global current_key_index
    current_key_index = (current_key_index + 1) % len(api_keys)
    return api_keys[current_key_index]
def get_video_metadata(video_id):
    try:
        # Get the next API key
        api_key = get_next_api_key()

        # Set up the YouTube Data API client
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Call the API to retrieve video metadata
        response = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            id=video_id
        ).execute()

        # Extract the relevant metadata
        if 'items' in response and len(response['items']) > 0:
            video = response['items'][0]
            metadata = {
                'title': video['snippet']['title'],
                'description': video['snippet']['description'],
                'channel_title': video['snippet']['channelTitle'],
                'publish_date': video['snippet']['publishedAt'],
                'duration': video['contentDetails']['duration'],
                'views': video['statistics']['viewCount'],
                'likes': video['statistics'].get('likeCount', 0),
                'comments': video['statistics'].get('commentCount', 0),
                'category_id': video['snippet']['categoryId'],
                'thumbnail_link': video['snippet']['thumbnails']['default']['url']
            }
            return metadata

    except Exception as e:
        print("An error occurred:", str(e))

    return None

# Example usage
video_id = get_video_id("https://www.youtube.com/watch?v=V1CIv_Cg_Qg")
metadata = get_video_metadata(video_id)

# Create a DataFrame from the metadata
data = pd.DataFrame([metadata])
data['duration'] = data['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())
data['cleanTitle'] = data['title'].apply(preprocess)
data['cleanTitle'] = data['cleanTitle'].apply(lambda x: ' '.join(x))
data['titleLength'] = data['title'].apply(lambda x: len(x))
data['descriptionLength'] : data['description'].apply(lambda x: len(x))
# Display the DataFrame
data

Unnamed: 0,title,description,channel_title,publish_date,duration,views,likes,comments,category_id,thumbnail_link,cleanTitle,titleLength
0,Incredible CREEP Blind Auditions in The Voice ...,These amazing kids sing Radioheads' Queen in T...,The Voice Global,2021-01-29T15:00:13Z,652.0,679890,11049,0,24,https://i.ytimg.com/vi/V1CIv_Cg_Qg/default.jpg,incredible creep blind audition voice kid,50


In [182]:
import joblib

# Load the saved pipeline
pipeline2 = joblib.load('85pct.pkl')

test = data[['cleanTitle','category_id','duration']]
test = test.rename(columns={'category_id': 'categoryId'})
test['categoryId'] = test['categoryId'].astype('category')
print(pipeline.predict_proba(test))
test = data[['cleanTitle','titleLength','category_id','duration']]
test = test.rename(columns={'category_id': 'categoryId'})
test['categoryId'] = 29
test['categoryId'] = test['categoryId'].astype('category')
pipeline2.predict_proba(test)

[[0.52 0.48]]


array([[0.76, 0.24]])

In [73]:
pipeline = joblib.load('85pct.pkl')
# Define the new data point
new_title = "Intelligent AI Chatbot in Python"
clean_new_title = preprocess(new_title)
# Join the preprocessed words back into a string
clean_new_title_str = ' '.join(clean_new_title)

data1 = {
    'cleanTitle': [clean_new_title_str],
    'titleLength': [32],
    'categoryId': [28],
    'duration': [2142.0]
}

test2 = pd.DataFrame(data1)
test2['categoryId'] = test2['categoryId'].astype('category')
pipeline.predict(test2)



# Junk

In [18]:
# Mutual Information 
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
X = df[['duration','titleLength','descriptionLength','sentimentScore']]
y = df['trendingOrNot']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,y,test_size=0.3,random_state=0
)
mutual_info = mutual_info_classif(X_train,y_train)
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

descriptionLength    0.024141
duration             0.021582
titleLength          0.020980
sentimentScore       0.016805
dtype: float64

In [40]:
#Chisquare Test For Feature Selection
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df[['categoryId', 'daytime', 'dayOfWeek','questionMark','exclamationMark','fullCapSentence']],
                                              df['trendingOrNot'],test_size=0.3,random_state=100)

from sklearn.feature_selection import chi2
f_p_values=chi2(X_train,y_train)
p_values=pd.Series(f_p_values[1])
p_values.index=X_train.columns
p_values

categoryId         0.000000e+00
daytime            5.508567e-35
dayOfWeek          3.568344e-17
questionMark       1.194363e-20
exclamationMark    1.769994e-29
fullCapSentence    5.243778e-07
dtype: float64

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from scipy.sparse import hstack
# vectorizer_filename = 'vectorizer.sav'
# with open(vectorizer_filename, 'rb') as f:
#     vectorizer = pickle.load(f)

vectorizer = TfidfVectorizer(min_df=1,stop_words='english')
X_title = vectorizer.fit_transform(df['cleanTitle'])

# Load the selector_title from file
# selector_title_filename = 'selector_title.sav'
# with open(selector_title_filename, 'rb') as f:
#     selector_title = pickle.load(f)
# Load the saved model from a file
# Extract features using TF-IDF


# Select the top 1000 features based on chi-squared test
# X_title = selector_title.fit_transform(X_title, df['trendingOrNot'])

# Convert 'categoryId' and 'daysOfUploading' to a numpy array
# X_numeric = df[['categoryId','title_length','duration']].to_numpy()
# Scale 'title_length' and 'duration' features
scaler = StandardScaler()
X_numeric = scaler.fit_transform(df[['titleLength', 'duration']].to_numpy())

# Include 'categoryId' as a feature
X_cat = df[['categoryId']].to_numpy()

# Concatenate the selected features and convert to a numpy array
X = hstack([X_title, X_numeric, X_cat])

# Split the data into training and testing sets
y = df['trendingOrNot']
# smote = SMOTE()
# X_resampled, y_resampled = smote.fit_resample(X, y)
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the SGD classifier model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set and evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.9244881527490223
              precision    recall  f1-score   support

           0       0.96      0.88      0.92     17377
           1       0.89      0.96      0.93     17399

    accuracy                           0.92     34776
   macro avg       0.93      0.92      0.92     34776
weighted avg       0.93      0.92      0.92     34776



In [None]:
unique_categories = df['categoryId'].unique()

# Get the embeddings for each unique category
category_embeddings = []
for category in unique_categories:
    embedding = model.predict(np.array([[category]]))
    category_embeddings.append(embedding[0])
    
# Create a new dataframe with the category IDs and their corresponding embeddings
embeddings_df = pd.DataFrame({'categoryId': unique_categories, 'embedding': category_embeddings})
embeddings_df

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate


# Define the numerical and categorical features
numerical_features = ['sentimentScore']
categorical_features = ['categoryId','dayOfWeek']

# Split the data into training and testing sets
X = df[['categoryId', 'sentimentScore','dayOfWeek']]
y = df['trendingOrNot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing pipelines for the numerical and categorical data
numerical_transformer = StandardScaler()

# Fit the numerical transformer on the training data
numerical_transformer.fit(X_train[numerical_features])

# Transform the training and testing numerical data
X_train_numerical = numerical_transformer.transform(X_train[numerical_features])
X_test_numerical = numerical_transformer.transform(X_test[numerical_features])

# Define the input layers for the numerical and categorical data
num_inputs = Input(shape=(X_train_numerical.shape[1],), name='num')
cat_inputs = Input(shape=(1,), name='cat')

# Define the embedding layer for the categorical data
cat_embed = Embedding(input_dim=15, output_dim=15, name='cat_embed')(cat_inputs)
cat_flat = Flatten()(cat_embed)

# Concatenate the numerical and categorical inputs
concatenated = Concatenate()([num_inputs, cat_flat])

# Define the dense layers for the model
dense_layer_1 = Dense(units=64, activation='relu')(concatenated)
dense_layer_2 = Dense(units=32, activation='relu')(dense_layer_1)
output_layer = Dense(units=1, activation='sigmoid')(dense_layer_2)

# Define the model
model = Model(inputs=[num_inputs, cat_inputs], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([X_train_numerical, X_train[categorical_features].values], y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate([X_test_numerical, X_test[categorical_features].values], y_test)
print('Test accuracy:', accuracy)

In [None]:
words = np.array(vectorizer.get_feature_names())

x = np.eye(X_test.shape[1])
probs = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:100]]
bad_words = words[ind[-100:]]

good_prob = probs[ind[:100]]
bad_prob = probs[ind[-100:]]

print("Good words\t     P(good | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))
    
print("Bad words\t     P(good | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))



IndexError: index 46431 is out of bounds for axis 0 with size 46430

In [None]:
from sklearn.naive_bayes import MultinomialNB

#the grid of parameters to search over
alphas = [.1, 1, 5]
min_dfs = [1]

#Find the best value for alpha and min_df, and the best classifier
best_alpha = None
best_min_df = None
maxscore=-np.inf
for alpha in alphas:
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df = min_df)
        Xthis, ythis = make_xy(df, vectorizer)
        Xtrainthis=Xthis[mask]
        ytrainthis=ythis[mask]
        clf = MultinomialNB(alpha=alpha)
        cvscore = cv_score(clf, Xtrainthis, ytrainthis, log_likelihood)

        if cvscore > maxscore:
            maxscore = cvscore
            best_alpha, best_min_df = alpha, min_df

In [None]:
vectorizer = CountVectorizer(min_df=best_min_df)
X, y = make_xy(df, vectorizer)
xtrain=X[mask]
ytrain=y[mask]
xtest=X[~mask]
ytest=y[~mask]

clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)

training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))

Accuracy on training data: 0.875608
Accuracy on test data:     0.768460


In [None]:
words = np.array(vectorizer.get_feature_names())

x = np.eye(xtest.shape[1])
probs = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:100]]
bad_words = words[ind[-100:]]

good_prob = probs[ind[:100]]
bad_prob = probs[ind[-100:]]

print("Good words\t     P(good | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))
    
print("Bad words\t     P(good | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))



Good words	     P(good | word)
                2022 0.99
              shorts 0.99
          highlights 0.98
                2021 0.98
           minecraft 0.98
             oficial 0.97
                2023 0.96
                 ufc 0.96
               among 0.95
                  mv 0.95
                 lil 0.95
            survived 0.95
            hardcore 0.95
               built 0.94
         hermitcraft 0.94
                 nba 0.94
                 000 0.94
                 nbc 0.94
              lakers 0.93
                 bts 0.93
                  24 0.92
                  fc 0.92
               music 0.92
              reacts 0.92
              tiktok 0.92
              funkin 0.91
                 cbs 0.91
                ring 0.91
            pregnant 0.91
               elden 0.91
           breakdown 0.91
           challenge 0.91
              roblox 0.90
             genshin 0.90
                 snl 0.90
          manchester 0.90
              reveal 0.90
       

In [None]:
clf.predict_proba(vectorizer.transform(['']))

array([[0.46509481, 0.53490519]])

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
X = df[['title']]
y = df.trendingOrNot

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2)
# X_train, X_test, y_train, y_test = train_test_split(os_data_X, os_data_y,test_size = 0.2)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',MultinomialNB())])
X_train = X_train['title'].values.ravel()
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [None]:
accuracy = text_clf.score(X_test['title'], y_test)
print('Accuracy:', accuracy)

Accuracy: 0.7887056763461258


In [None]:
#Categorical Feature(Chi-Squared Test)
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state=123)

f_score=chi2(X_train,y_train)

pvalues = pd.Series(f_score[1])
pvalues.index = X_train.columns
pvalues.sort_values(ascending=False)

dayOfWeek     2.165008e-42
categoryId    0.000000e+00
views         0.000000e+00
likes         0.000000e+00
comments      0.000000e+00
duration      0.000000e+00
dtype: float64

In [None]:
#Regularization-based Feature Selection(L1 (Lasso) / L2(Ridge))
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state=123)

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression


sel = SelectFromModel(LogisticRegression(penalty="l2",C=1,solver="liblinear"))

sel.fit(X_train,y_train)

sel.get_feature_names_out()

array(['likes', 'comments'], dtype=object)

In [None]:
# Mutual Information
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
X = df[['categoryId',  'dayOfWeek','daytime','title_length','duration']]
y = df['trendingOrNot']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state=123)
mutual_info = mutual_info_classif(X_train,y_train)
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

duration        0.059012
categoryId      0.044644
title_length    0.022751
dayOfWeek       0.005229
daytime         0.004172
dtype: float64

In [None]:
# Recursive Feature Elimination(RFE)

from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

X = df[['categoryId',  'dayOfWeek','daytime','title_length','duration']]
y = df['trendingOrNot']

rfe = RFE(estimator=DecisionTreeClassifier(),n_features_to_select=2)
rfe.fit(X,y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=2)

In [None]:
for i, col in enumerate(X.columns):
    print(f"{col} selected={rfe.support_[i]} rank={rfe.ranking_[i]}")


categoryId selected=False rank=2
dayOfWeek selected=False rank=3
daytime selected=False rank=4
title_length selected=True rank=1
duration selected=True rank=1
