In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
# Fetch the NLTK data used by this notebook: WordNet and OMW for the lemmatizer,
# the Punkt models for word_tokenize, and the English stop-word list.
# 'punkt_tab' is the Punkt resource that newer NLTK releases look up from
# word_tokenize; without it tokenization raises LookupError on those versions.
# NOTE(review): confirm against the NLTK version installed in this environment.
for resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Load the training data and report its (rows, columns) shape.
# NOTE(review): the file has an .xls extension but is parsed as CSV — confirm the
# actual file format.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/data/reddit/Train_Data.xls"
)
print(df.shape)

# EDA: preview the first rows

df.head()

In [None]:
# Summary of the frame: column dtypes and non-null counts
df.info()

In [None]:
# Count of missing values per column
df.isna().sum()

In [None]:
# Column labels of the frame, as an array
df.columns.values

In [None]:
# Print the number of distinct values in each listed column
cols = [
    'text', 'author', 'controversiality',
    'parent_text', 'parent_score', 'parent_votes',
    'parent_author', 'parent_controversiality', 'Score',
]
for name in cols:
    print(f"{name} : {df[name].nunique()}")

In [None]:
# Drop the 'parent_votes' column in place.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df.drop(columns=['parent_votes'], inplace=True, errors='ignore')

In [None]:
# Normalize the string columns: lower-case, then trim surrounding whitespace
cat_cols = ['text', 'author', 'parent_text', 'parent_author']
for name in cat_cols:
    df[name] = df[name].str.lower().str.strip()
df.head()

In [None]:
# Text pre-processing
# Strip every character listed in string.punctuation from the string columns

punct_table = str.maketrans('', '', string.punctuation)
for name in cat_cols:
    df[name] = df[name].apply(lambda value: value.translate(punct_table))
df.head()

In [None]:
# Tokenization
def _alpha_tokens(text):
    """Tokenize ``text`` and keep only the purely alphabetic tokens.

    Args:
        text: String to tokenize.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` for which
        ``str.isalpha()`` is true.
    """
    return [token for token in word_tokenize(text) if token.isalpha()]


def text_tokens(row):
    """Return the alphabetic tokens of ``row['text']``."""
    return _alpha_tokens(row['text'])


def parent_text_tokens(row):
    """Return the alphabetic tokens of ``row['parent_text']``."""
    return _alpha_tokens(row['parent_text'])


# Tokenize each column directly rather than with a row-wise DataFrame.apply(axis=1):
# every row still gets the same token list, without materializing a Series per row.
df['text_tokens'] = df['text'].apply(_alpha_tokens)
df['parent_text_tokens'] = df['parent_text'].apply(_alpha_tokens)

In [None]:
# Preview the frame with the token columns added
df.head()

In [None]:
# Remove stop words
stop_words = stopwords.words('english')
# Hashed copy for constant-time membership checks; the list above is kept unchanged.
stop_word_set = frozenset(stop_words)

column_tokens = ['text_tokens', 'parent_text_tokens']

for col in column_tokens:
    # Each cell holds a token list; the surviving tokens are re-joined into a single
    # space-separated string (the lemmatizing cell splits it again).
    df[col] = df[col].apply(
        lambda tokens: ' '.join(w for w in tokens if w not in stop_word_set)
    )
df.head()

In [None]:
# Lemmatizing: split each space-separated string and lemmatize word by word,
# leaving a list of lemmas in every cell
wl = WordNetLemmatizer()
for name in column_tokens:
    df[name] = df[name].apply(
        lambda sentence: [wl.lemmatize(str(tok)) for tok in sentence.split()]
    )
df.head()

In [None]:
# Rebuild the text columns from the lemma lists, then discard the helper columns
df['text'] = df['text_tokens'].apply(' '.join)
df['parent_text'] = df['parent_text_tokens'].apply(' '.join)
df.drop(columns=['text_tokens', 'parent_text_tokens'], inplace=True)

In [None]:
# Preview the frame after the text columns were rebuilt
df.head()

In [None]:
# Vectorizing data
def _feature_names(vectorizer):
    """Return the vocabulary terms of a fitted vectorizer, in column order.

    Uses ``get_feature_names_out`` when the vectorizer has it and falls back to
    ``get_feature_names`` otherwise; the latter was removed in scikit-learn 1.2,
    the former does not exist before 1.0.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Bag-of-words counts over the full vocabulary.
# NOTE(review): .toarray() densifies the whole document-term matrix; this may be
# large for a big vocabulary — confirm it is needed beyond inspecting its shape.
cv = CountVectorizer()
text_cv = cv.fit_transform(df['text']).toarray()
# index=df.index keeps the rows aligned with df for any later column-wise concat.
text_cv = pd.DataFrame(text_cv, columns=_feature_names(cv), index=df.index)

# TF-IDF restricted to 50 features, ignoring terms present in more than 70% of rows.
tfidv = TfidfVectorizer(max_features=50, min_df=1, max_df=0.7)
text_tf = tfidv.fit_transform(df['text']).toarray()
text_tf = pd.DataFrame(text_tf, columns=_feature_names(tfidv), index=df.index)


In [None]:
# (rows, vocabulary terms) of the count-vectorized frame
text_cv.shape

In [None]:
# (rows, features) of the TF-IDF frame
text_tf.shape

In [None]:
# Numeric columns, target included
numeric_data = df.loc[
    :, ['controversiality', 'parent_score', 'parent_controversiality', 'Score']
]
# Pairwise correlation between the numeric columns
numeric_data.corr()

In [None]:
# Annotated heatmap of the numeric correlation matrix
sns.heatmap(data=numeric_data.corr(), annot=True)

In [None]:
# Features: the TF-IDF columns followed by the numeric columns minus the target
X = pd.concat([text_tf, numeric_data.drop(columns='Score')], axis=1)
# Target
y = df['Score']

In [None]:
# Display the assembled feature frame
X

In [None]:
# Hold out 20% of the rows for testing; the split is seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [None]:
# Model building
# Linear regression: fit() returns the estimator itself, so construct and fit in one step
linear = LinearRegression().fit(X_train, y_train)

# Predictions on the held-out split
y_pred_linear = linear.predict(X_test)

In [None]:
# Root of the mean squared error on the held-out split
linear_mse = mean_squared_error(y_test, y_pred_linear)
print('RMSE for linear regression is: ', np.sqrt(linear_mse))

In [None]:
# KNN
# 10-fold cross-validated search over n_neighbors = 1..99
param_grid = {'n_neighbors': np.arange(1, 100)}
knn = KNeighborsRegressor()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)
knn_cv.fit(X_train, y_train)
knn_cv.best_params_

In [None]:
# Refit KNN with the n_neighbors chosen by the grid search (knn_cv) instead of a
# hard-coded copy of its output, so the model cannot drift from the search result
# when the data or the grid changes. The original note recorded 83 as the selected value.
knn = KNeighborsRegressor(**knn_cv.best_params_)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [None]:
# Root of the mean squared error on the held-out split
knn_mse = mean_squared_error(y_test, y_pred_knn)
print('RMSE for KNN regression is: ', np.sqrt(knn_mse))

In [None]:
# Random Forest
# 10-fold grid search over n_estimators and max_depth, each in 1..49
# (49 * 49 combinations * 10 folds = 24,010 fits, plus the final refit).
param_grid = {'n_estimators': np.arange(1, 50), 'max_depth': np.arange(1, 50)}
# Seed the forest so the selected parameters are reproducible between runs.
Randomreg = RandomForestRegressor(random_state=7)
# n_jobs=-1 spreads the independent fits over all available cores.
Randomreg_cv = GridSearchCV(Randomreg, param_grid, cv=10, n_jobs=-1)
Randomreg_cv.fit(X_train, y_train)
Randomreg_cv.best_params_

In [None]:
# Final random forest with n_estimators=9 and max_depth=3 (the values the original
# note attributes to cross validation).
# max_features=1.0 uses every feature at each split, which is what 'auto' meant for
# regressors; the 'auto' string was removed in scikit-learn 1.3.
# random_state makes the fitted forest, and therefore the predictions, reproducible.
Randomreg = RandomForestRegressor(
    n_estimators=9, max_depth=3, max_features=1.0, random_state=7
)
Randomreg.fit(X_train, y_train)

y_pred_random = Randomreg.predict(X_test)