In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import nltk
from nltk.corpus import stopwords

In [54]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ukarj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
df = pd.read_csv('Train_Data.csv')
test_df = pd.read_csv('Test_Data.csv')

In [56]:
df.columns

Index(['text', 'author', 'controversiality', 'parent_text', 'parent_score',
       'parent_votes', 'parent_author', 'parent_controversiality', 'Score'],
      dtype='object')

In [57]:
test_df.columns

Index(['text', 'author', 'controversiality', 'parent_text', 'parent_score',
       'parent_votes', 'parent_author', 'parent_controversiality'],
      dtype='object')

In [58]:
df.isnull().sum()

text                       0
author                     0
controversiality           0
parent_text                0
parent_score               0
parent_votes               0
parent_author              0
parent_controversiality    0
Score                      0
dtype: int64

In [59]:
# Setting Score column as Y(Target) and dropping it from df
Y = df['Score']
df.drop(['Score'],axis=1,inplace=True)

In [60]:
df.shape,test_df.shape

((4999, 8), (1015, 8))

In [61]:
# Combining df and test_df for better NLP
df = df.append(test_df)

In [62]:
df

Unnamed: 0,text,author,controversiality,parent_text,parent_score,parent_votes,parent_author,parent_controversiality
0,i must be retarded i thought it meant con lawl...,['calantus'],0,"It's quite unfair to call Hillary Clinton a ""c...",245,245,Whisper,0
1,DOWNMODDED FOR IRRELEVANCE? ISN'T THAT HOW THI...,['Shadowrose'],0,upmodded for awesome kindness,32,32,b3mus3d,0
2,"THAT WAS SUPPOSED TO MEAN "" BY A PLACE WHERE P...",['NExusRush'],0,"What the hell does ""because its by a golf cour...",12,12,mr_jellyneck,0
3,I THOUGHT EVERYONE DID; ITS FUCKING DELICIOUS :\,['R0N_SWANS0N'],0,NICE TRY JENNIFER! I KNOW IT'S YOU AND I KNOW...,117,117,ometzo,0
4,"Great work, Zhesbe! I'd give you a raise but y...",['reddums'],0,"""HEY BOSS COME LOOK AT WHAT I DID!""",1933,1933,Zhesbe,0
...,...,...,...,...,...,...,...,...
1010,HTTP://I.IMGUR.COM/OZJD6.GIF,['GeneralWarts'],0,http://25.media.tumblr.com/tumblr_m35zo3hdID1r...,181,181,Octopuscabbage,0
1011,RIVEN OP,['Esperek'],0,being on fire would fit into his lore,6,6,Yvern,0
1012,"WOW, WELL FUCK MARK.",['adrenosine'],0,Oh he is!\n\nHis life is miles and miles ahead...,765,765,defeatedbird,0
1013,"ALLTSÅ JAG STÄLLDE EN FRÅGA, DU SÄGER ALLTSÅ A...",['vattenpuss'],0,För att förbättra informationen har Migrations...,13,13,rudolf_hesst,0


In [63]:
df.nunique()

text                       6000
author                     5158
controversiality              2
parent_text                6003
parent_score               1099
parent_votes               1099
parent_author              5309
parent_controversiality       2
dtype: int64

In [64]:
(df['parent_score']!=df['parent_votes']).sum(),(df['controversiality']!=df['parent_controversiality']).sum()

(0, 11)

In [65]:
#Since parent_score and parent_votes are same dropping parent_votes from the df
df.drop(['parent_votes'],axis=1,inplace=True)

- ## Text Processing

In [66]:
# Cleaning the data by removing punctuations, stopwords, etc

In [67]:
# defining  functions for cleaning text
def lower_it(text):
    '''Function to convert text to lower case'''
    return text.lower()

def remove_punctuation(text):
    '''Function to remove punctuations'''
    new_text = ''.join([i for i in text if i not in string.punctuation])
    return new_text

def remove_stopwords(text):
    '''Function to remove stopwords from text'''
    stop_words = nltk.corpus.stopwords.words('english')
    new_text = ' '.join([i for i in text.split() if i not in stop_words])
    return new_text

In [68]:
# Applying cleaning functions on text columns
df['text'] = df['text'].apply(lambda x: lower_it(x))
df['text'] = df['text'].apply(lambda x: remove_punctuation(x))
df['text'] = df['text'].apply(lambda x: remove_stopwords(x))
df['parent_text'] = df['parent_text'].apply(lambda x: lower_it(x))
df['parent_text'] = df['parent_text'].apply(lambda x: remove_punctuation(x))
df['parent_text'] = df['parent_text'].apply(lambda x: remove_stopwords(x))

In [69]:
df

Unnamed: 0,text,author,controversiality,parent_text,parent_score,parent_author,parent_controversiality
0,must retarded thought meant con lawl oh well work,['calantus'],0,quite unfair call hillary clinton cunt lacks d...,245,Whisper,0
1,downmodded irrelevance isnt works,['Shadowrose'],0,upmodded awesome kindness,32,b3mus3d,0
2,supposed mean place people undoubtedly snake b...,['NExusRush'],0,hell golf course anything think bunch rich whi...,12,mr_jellyneck,0
3,thought everyone fucking delicious,['R0N_SWANS0N'],0,nice try jennifer know know like baba ganoush,117,ometzo,0
4,great work zhesbe id give raise seem handled,['reddums'],0,hey boss come look,1933,Zhesbe,0
...,...,...,...,...,...,...,...
1010,httpiimgurcomozjd6gif,['GeneralWarts'],0,http25mediatumblrcomtumblrm35zo3hdid1r2dv2oo14...,181,Octopuscabbage,0
1011,riven op,['Esperek'],0,fire would fit lore,6,Yvern,0
1012,wow well fuck mark,['adrenosine'],0,oh life miles miles ahead mine im lonely neuro...,765,defeatedbird,0
1013,alltså jag ställde en fråga du säger alltså at...,['vattenpuss'],0,för att förbättra informationen har migrations...,13,rudolf_hesst,0


In [70]:
# removing punctuations from author column
df['author'] = df['author'].apply(lambda x: remove_punctuation(x))

In [71]:
tfid = TfidfVectorizer(max_features=100)

In [72]:
df_vectored = tfid.fit_transform(df['text'],df['parent_text']).toarray()

In [73]:
df_vectored.shape

(6014, 100)

In [74]:
df_vectored = pd.DataFrame(df_vectored,columns=tfid.get_feature_names())



In [75]:
# numerical columns from df
numeric_cols = ['controversiality','parent_score','parent_controversiality']

In [76]:
df.reset_index(drop=True,inplace=True)

In [77]:
# Combining the vectored column dataframe and numerical columns from original dataframe
data = pd.concat([df_vectored,df[numeric_cols]],axis=1)

In [78]:
# dividing the data in train and test as earlier 
train = data.iloc[:4999]
test = data.iloc[4999:]

In [79]:
train.shape,test.shape

((4999, 103), (1015, 103))

In [80]:
# Splitting the training data in further training and testing
x_train,x_test,y_train,y_test = train_test_split(train,Y,test_size=0.2,random_state=29)

In [87]:
rfr = RandomForestRegressor()

In [88]:
rfr.fit(x_train,y_train)

RandomForestRegressor()

In [89]:
svr = SVR()

In [90]:
svr.fit(x_train,y_train)

SVR()

In [91]:
rfr_train_pred = rfr.predict(x_train)
rfr_test_pred = rfr.predict(x_test)

svr_train_pred = svr.predict(x_train)
svr_test_pred = svr.predict(x_test)

In [92]:
print('\nRandom Forest Regressor')
print('\n\tTraining scores : ')
print('\t\tRMSE : ',np.sqrt(metrics.mean_squared_error(y_train,rfr_train_pred)))
print('\t\tR2 : ',metrics.r2_score(y_train,rfr_train_pred))
print('\n\tTesting scores : ')
print('\t\tRMSE : ',np.sqrt(metrics.mean_squared_error(y_test,rfr_test_pred)))
print('\t\tR2 : ',metrics.r2_score(y_test,rfr_test_pred))

print('\nSupport Vector Regressor')
print('\n\tTraining scores : ')
print('\t\tRMSE : ',np.sqrt(metrics.mean_squared_error(y_train,svr_train_pred)))
print('\t\tR2 : ',metrics.r2_score(y_train,svr_train_pred))
print('\n\tTesting scores : ')
print('\t\tRMSE : ',np.sqrt(metrics.mean_squared_error(y_test,svr_test_pred)))
print('\t\tR2 : ',metrics.r2_score(y_test,svr_test_pred))


Random Forest Regressor

	Training scores : 
		RMSE :  77.66240135260045
		R2 :  0.8476958254891407

	Testing scores : 
		RMSE :  186.59646648295058
		R2 :  0.1904420659782814

Support Vector Regressor

	Training scores : 
		RMSE :  179.93042733732293
		R2 :  0.1824780915724834

	Testing scores : 
		RMSE :  188.7194762867934
		R2 :  0.17191570528295008


In [95]:
# Predicting on test data
rfr_predict_test = rfr.predict(test)
svr_predict_test = svr.predict(test)

In [108]:
submission1 = pd.DataFrame(rfr_predict_test,columns=[('prediction')])

In [102]:
submission2 = pd.DataFrame(svr_predict_test,columns=[('prediction')])

In [112]:
submission1.to_csv('sub1.csv',index=False)
submission2.to_csv('sub2.csv',index=False)

In [109]:
submission1

Unnamed: 0,prediction
0,187.920000
1,23.217500
2,-0.148836
3,229.350000
4,15.180350
...,...
1010,141.892333
1011,-5.063141
1012,277.910000
1013,24.447270
