In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_log_error,mean_squared_error,roc_auc_score
from sklearn import metrics
from sklearn.linear_model import SGDRegressor,LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.pipeline import Pipeline
from math import sqrt
from scipy.sparse import csr_matrix, hstack
from sklearn.svm import SVR
from sklearn import preprocessing
import xgboost as xgb
from sklearn.decomposition import PCA

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/Data_Train.csv')

In [None]:
# Preview the first rows of the raw training data.
data.head()

In [None]:
# Discard the identifier-like columns; none of them is used in the rest of the analysis.
unused_columns = ['Unique_ID', 'Country', 'Name']
data = data.drop(columns=unused_columns)

In [None]:
# Confirm the identifier columns are gone.
data.head()

In [None]:
#data = pd.get_dummies(data,columns=['Genre'],drop_first=True)

In [None]:
# Preview the frame again (the Genre one-hot encoding in the cell above is commented out).
data.head()

In [None]:
# Derive calendar features from the timestamp column.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['dayofweek'] = data['Timestamp'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
# weekend flag: 1 for Saturday/Sunday (dayofweek 5 or 6), otherwise 0
data['weekend'] = (data['dayofweek'] > 4).astype('int64')

# Mean views per weekday, then per weekend flag.
for group_key in ('dayofweek', 'weekend'):
    display(data.groupby([group_key])['Views'].mean())

In [None]:
data.describe()
# Open question from the original author: where the mean and the median (50% row)
# differ noticeably, does that point to outliers? — TODO confirm.

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Pairwise scatter plots of the frame with kernel-density estimates on the diagonal;
# the trailing ';' suppresses the textual repr of the returned axes.
pd.plotting.scatter_matrix(data, alpha=0.3, figsize=(14,8), diagonal='kde');

In [None]:
# Single-feature regression plots against Views; only Followers is active.
# NOTE(review): Likes and Popularity are converted to floats only in a later cell,
# so those variants presumably need to run after that conversion — confirm.
#sns.regplot(x="Likes", y="Views", data=data)
#sns.regplot(x="Comments", y="Views", data=data)
#sns.regplot(x="Popularity", y="Views", data=data)
sns.regplot(x="Followers", y="Views", data=data)

In [None]:
def popularityConverting(x):
    """Convert an abbreviated count such as '1,234', '12.5K' or '3M' to a float.

    Parameters
    ----------
    x : str or number
        Text with optional thousands separators (',') and an optional
        magnitude suffix (K, M or B, case-insensitive). Values that are
        already numeric (including NaN) are passed through ``float()``.

    Returns
    -------
    float
        The numeric value. An empty or whitespace-only string yields NaN so a
        blank cell does not abort a whole ``Series.apply``.

    Raises
    ------
    ValueError
        If the text left after removing separators and suffix is not a number.
    """
    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

    # Cells that pandas already parsed as numbers (or NaN) are not subscriptable.
    if not isinstance(x, str):
        return float(x)

    text = x.strip().replace(',', '')
    if not text:
        return float('nan')

    factor = multipliers.get(text[-1].upper())
    if factor is not None:
        return float(text[:-1]) * factor
    # No suffix: always return a float rather than the cleaned-up string.
    return float(text)

In [None]:
# Popularity and Likes are stored as abbreviated text; convert both columns to floats.
for abbreviated_column in ('Popularity', 'Likes'):
    data[abbreviated_column] = (
        data[abbreviated_column].apply(popularityConverting).astype('float')
    )
data.head()

In [None]:
# Pairwise correlations of the numeric columns only. The frame still contains text
# and datetime columns, which DataFrame.corr() no longer drops silently on pandas >= 2.0.
data.select_dtypes(include='number').corr()

In [None]:
def text_process(text, stop_words=None):
    """Strip punctuation and stop words from ``text``.

    Parameters
    ----------
    text : str
        Raw text, e.g. a song title.
    stop_words : iterable of str, optional
        Lower-case words to remove. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached on the function so it is
        not re-read for every word of every row.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly empty).
    """
    if stop_words is None:
        stop_words = getattr(text_process, '_english_stop_words', None)
        if stop_words is None:
            # A frozenset gives O(1) membership tests instead of a list scan.
            stop_words = frozenset(stopwords.words('english'))
            text_process._english_stop_words = stop_words

    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = [
        word for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]
    return " ".join(kept_words)
# Cast every title to str first, then strip punctuation and stop words.
data['Song_Name'] = data['Song_Name'].astype('str').apply(text_process)

In [None]:
# Spot-check the cleaned titles.
data['Song_Name'].head()

In [None]:
# Exploring whether the timestamp could serve as a feature.
# Rows carrying the earliest and the latest timestamp:
display(data[data['Timestamp']==data['Timestamp'].min()])
display(data[data['Timestamp']==data['Timestamp'].max()])
# Frequency of each Views value per timestamp, in chronological order.
data.sort_values(by=['Timestamp']).groupby(['Timestamp'])['Views'].value_counts()

In [None]:
# Predictors shared by every model below. `features_data` keeps the unscaled frame;
# its column labels are reused by the feature-importance plots.
model_columns = ['Comments', 'Likes', 'Popularity', 'Followers', 'weekend']
features_data = data.loc[:, model_columns]
y = data['Views']

# Standardise the predictors into a NumPy array.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
X = StandardScaler().fit(features_data).transform(features_data)

In [None]:
# Pairwise view of the target together with the model's predictors.
plotted_columns = ['Views', 'Comments', 'Likes', 'Popularity', 'Followers', 'weekend']
pd.plotting.scatter_matrix(
    data.loc[:, plotted_columns],
    alpha=0.3,
    figsize=(14, 8),
    diagonal='kde',
);

In [None]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Fit PCA on the standardised training data and plot the explained variance.
pca = PCA(n_components=None, svd_solver="full")
pca.fit(StandardScaler().fit_transform(X_train))

explained_ratio = pca.explained_variance_ratio_
print(explained_ratio)
cum_var_exp = np.cumsum(explained_ratio)

# Derive the x positions from the fitted PCA instead of hard-coding the number of
# features, so the plot keeps working when the feature set changes.
component_positions = range(1, len(explained_ratio) + 1)

plt.figure(figsize=(10, 6))
plt.bar(component_positions, explained_ratio, align="center",
        color='red', label="Individual explained variance")
plt.step(component_positions, cum_var_exp, where="mid", label="Cumulative explained variance")
plt.xticks(component_positions)
plt.legend(loc="best")
plt.xlabel("Principal component index", {"fontsize": 14})
plt.ylabel("Explained variance ratio", {"fontsize": 14})
plt.title("PCA on training data", {"fontsize": 16});
print(cum_var_exp)

In [None]:
# Ordinary least squares baseline: fit on the training split, score on the held-out split.
lr_regressor = LinearRegression()
lr_regressor.fit(X_train, y_train)
preds = lr_regressor.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))

# Points on the diagonal would be perfect predictions.
plt.scatter(y_test, preds)
plt.title('scatter plot between actual y and predicted y')
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.grid()
plt.show()

print("RMSE: %f" % (rmse))


In [None]:
# Tune the Lasso penalty with 5-fold cross-validation.
lasso_params = {'alpha': [0.005, 0.001, 0.02, 0.03, 0.05, 0.06]}
laso_regressor = Lasso()
laso_regressor_cv = GridSearchCV(laso_regressor, lasso_params, cv=5)
# Search on the training split only: fitting on the full X, y would let the held-out
# rows influence the hyperparameter choice and bias the later test RMSE.
laso_regressor_cv.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", laso_regressor_cv.best_params_)
# No `scoring` is passed, so best_score_ is the regressor's default score (R^2), not an accuracy.
print("best mean CV R^2 :", laso_regressor_cv.best_score_)

In [None]:
# Lasso with a fixed penalty, evaluated on the held-out split.
laso_regressor = Lasso(alpha=0.001)
laso_regressor.fit(X_train, y_train)
preds = laso_regressor.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))

plt.scatter(y_test, preds)
plt.title('scatter plot between actual y and predicted y')
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.grid()
plt.show()

print("RMSE: %f" % (rmse))

In [None]:
# Grid-search the random forest with 2-fold cross-validation.
# A fixed random_state makes the search repeatable across kernel restarts.
rf_regressor = RandomForestRegressor(random_state=42)
hyperparam_grid = {
    "n_estimators": [10, 50, 100],
    "max_features": ["sqrt", "log2", 0.4, 0.5],
    "min_samples_leaf": [1, 3, 5],
}
rf_regressor_cv = GridSearchCV(rf_regressor, hyperparam_grid, cv=2)
# Search on the training split only so the held-out rows stay unseen during tuning.
rf_regressor_cv.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", rf_regressor_cv.best_params_)
# No `scoring` is passed, so best_score_ is the regressor's default score (R^2), not an accuracy.
print("best mean CV R^2 :", rf_regressor_cv.best_score_)

In [None]:
# Random forest with fixed hyperparameters, evaluated on the held-out split.
# random_state pins the bootstrap/feature sampling so the reported RMSE is reproducible.
rf_regressor = RandomForestRegressor(max_features=0.4, min_samples_leaf=1,
                                     n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

preds = rf_regressor.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
plt.scatter(y_test, preds)
plt.grid()
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.title('scatter plot between actual y and predicted y')
plt.show()
print("RMSE: %f" % (rmse))

In [None]:
# Rank the random forest's features by importance and plot them, most important first.
feature_importance = rf_regressor.feature_importances_
ascending_order = np.argsort(feature_importance)
print(ascending_order)
indices = ascending_order[::-1]
print(indices)

bar_positions = range(1, 6)
plt.figure(figsize=(8, 5))
plt.bar(bar_positions, feature_importance[indices], align="center")
plt.xticks(bar_positions, features_data.columns[indices], rotation=90)
plt.title("Feature Importance", {"fontsize": 16});

In [None]:
# Base estimator whose hyperparameters are searched below.
xgb_regressor=xgb.XGBRegressor()

# Candidate values for each hyperparameter.
n_estimators = [100, 500]
max_depth = [2, 3, 5, 10, 15]
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    }

# Set up the random search with 2-fold cross validation (cv=2), scored by negative
# mean absolute error; n_iter is not passed, so the library default applies.
xgb_regressor_cv = RandomizedSearchCV(estimator=xgb_regressor,
            param_distributions=hyperparameter_grid,
            cv=2,
            scoring = 'neg_mean_absolute_error',
            return_train_score = True,
            random_state=42)
# The search only sees the training split.
xgb_regressor_cv.fit(X_train,y_train)
# Display the best configuration found.
xgb_regressor_cv.best_estimator_

In [None]:
#regressor=xgb.XGBRegressor(objective ='reg:squarederror',n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.1, booster='gbtree', base_score=0.25)
# XGBoost with explicit hyperparameters.
# 'reg:linear' is the deprecated alias of 'reg:squarederror', and the None-valued
# legacy arguments (missing, nthread, seed, silent) are dropped: they only restated
# defaults and are deprecated or no longer accepted by newer XGBoost releases.
xgb_regressor = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.15, max_delta_step=0,
             max_depth=10, min_child_weight=1, n_estimators=500,
             n_jobs=1, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, verbosity=1)
xgb_regressor.fit(X_train, y_train)

# Evaluate on the held-out split.
preds = xgb_regressor.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
plt.scatter(y_test, preds)
plt.grid()
plt.xlabel('Actual y')
plt.ylabel('Predicted y')
plt.title('scatter plot between actual y and predicted y')
plt.show()
print("RMSE: %f" % (rmse))

In [None]:
# TODO(review): the original note on this cell read only "outliers"; its intent is unclear.
# Raw feature-importance scores of the fitted XGBoost model.
xgb_regressor.feature_importances_

In [None]:
# Rank the XGBoost model's features by importance and plot them, most important first.
feature_importance = xgb_regressor.feature_importances_
ascending_order = np.argsort(feature_importance)
print(ascending_order)
indices = ascending_order[::-1]
print(indices)

bar_positions = range(1, 6)
plt.figure(figsize=(8, 5))
plt.bar(bar_positions, feature_importance[indices], align="center")
plt.xticks(bar_positions, features_data.columns[indices], rotation=90)
plt.title("Feature Importance", {"fontsize": 16});

In [None]:
# Side-by-side held-out RMSE of every fitted model.
estimators = {
    "Linear Regressor": lr_regressor,
    "Lasso Regressor": laso_regressor,
    "Random Forest Regressor": rf_regressor,
    "XGB Regressor": xgb_regressor,
}
for model_name, fitted_model in estimators.items():
    held_out_rmse = np.sqrt(mean_squared_error(y_test, fitted_model.predict(X_test)))
    print('RMSE {}:{:.2f}'.format(model_name, held_out_rmse))

In [None]:
# Load the unlabeled competition set and repeat the training-time preprocessing.
# NOTE(review): this path has no 'data/' directory, unlike the training CSV — confirm the location.
data_test = pd.read_csv('./Data_Test.csv')
display(data_test.head())

# Keep the ids for the submission file before they are dropped.
label = data_test['Unique_ID']
data_test = data_test.drop(columns=['Unique_ID', 'Country', 'Name', 'Song_Name'])

# Calendar features (dayofweek: Monday=0 ... Sunday=6; weekend = Saturday/Sunday).
data_test['Timestamp'] = pd.to_datetime(data_test['Timestamp'])
data_test['dayofweek'] = data_test['Timestamp'].dt.dayofweek
data_test['weekend'] = (data_test['dayofweek'] > 4).astype('int64')
display(data_test.head())

# Abbreviated counts -> floats.
for abbreviated_column in ('Popularity', 'Likes'):
    data_test[abbreviated_column] = (
        data_test[abbreviated_column].apply(popularityConverting).astype('float')
    )

X_testf = data_test.loc[:, ['Comments', 'Likes', 'Popularity', 'Followers', 'weekend']]
#y_testf = data['Views']
#y_testf = data['Views']

In [None]:
# Sanity check: rows in the competition set x number of model features.
X_testf.shape

In [None]:
# Scale the competition features with the statistics of the TRAINING features.
# Fitting a fresh scaler on the test set would centre and scale it differently from
# the data the model was trained on. `features_data` is the unscaled training frame
# with the same columns, so this reproduces the scaler used to build X.
training_scaler = StandardScaler().fit(features_data)
X_testf = training_scaler.transform(X_testf)
display(len(X_testf))
Y_pred = xgb_regressor.predict(X_testf)
display(len(Y_pred))

In [None]:
# Show the predictions and check that there is one id per prediction.
print(Y_pred)
display(len(label))

In [None]:
# Sample file shipped with the data set.
# NOTE(review): `sub_df` is not used in the cells visible here.
sub_df = pd.read_excel('./Sample_Submission.xlsx')

# One row per test id with its predicted view count.
submission_columns = {"Unique_ID": label, "Views": Y_pred}
submission = pd.DataFrame(submission_columns)
submission.to_excel('./Sample_Submission_Final.xlsx', index=False)

In [None]:
# Display the scaled training matrix that the network below is fitted on.
X_train

In [None]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: square root of the mean squared difference of the inputs."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Importing the Keras libraries and packages
import keras
from keras.wrappers.scikit_learn import KerasClassifier,KerasRegressor
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU,PReLU,ELU
from keras.layers import Dropout


# Model factory handed to the KerasRegressor wrapper below.
def create_model():
    """Build and compile the feed-forward network.

    Two ReLU hidden layers (12 and 8 units) take 5 input features and feed a
    single linear output unit. The model is compiled with the custom RMSE loss
    and the Adamax optimizer.
    """
    layers = (
        Dense(units=12, input_dim=5, kernel_initializer='he_uniform', activation='relu'),
        Dense(units=8, kernel_initializer='he_uniform', activation='relu'),
        Dense(units=1, kernel_initializer='he_uniform'),
    )
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss=root_mean_squared_error, optimizer='Adamax')
    return model
# Wrap the factory in the scikit-learn style regressor wrapper.
model = KerasRegressor(build_fn=create_model)

# Train on the training split; validation_split=0.30 sets aside part of it for validation.
fit_options = dict(validation_split=0.30, batch_size=10, epochs=100)
model_history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Predict the held-out split with the trained network.
ann_pred=model.predict(X_test)

In [None]:
# Held-out RMSE of the network, comparable with the figures reported for the other models.
ann_mse = mean_squared_error(y_test, ann_pred)
rmse = np.sqrt(ann_mse)
print("RMSE: %f" % (rmse))