In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

### Dataset 1: overwatch

In [30]:
# Read in dataset 1 >>>

dataset1_csv = './datasets/overwatch_raw.csv'
dataset1 = pd.read_csv(dataset1_csv)
dataset1.drop(columns=['Unnamed: 0'], inplace=True)

# Dataframe >>>
df_1 = pd.DataFrame(dataset1)

# Add constants >>>
df_1 = df_1[['subreddit', 'selftext', 'title']]
df_1['target'] = 1

# test >>>
ttt = df_1.loc[(df_1['selftext'] == '[removed]') | (df_1['selftext'] == '[deleted]')]
print(f'To remove >>> {len(ttt)}')

# Report >>>
print(f'Shape >>> {df_1.shape}')
print(f'Duplicates >>> {df_1.duplicated().sum()}')

To remove >>> 272
Shape >>> (2000, 4)
Duplicates >>> 45


In [31]:
# Clean dataset 1 >>>
print(df_1['subreddit'][0])

# Remove [removed] & [deleted] >>>
df_1['selftext'] = df_1['selftext'].replace('[removed]', '')
df_1['selftext'] = df_1['selftext'].replace('[deleted]', '')
df_1['selftext'] = df_1['selftext'].fillna('')

# test >>>
ttt = df_1.loc[(df_1['selftext'] == '[removed]') | (df_1['selftext'] == '[deleted]')]
print(f'To remove >>> {len(ttt)}')

# Combine columns >>>
df_1['text'] = df_1['selftext'].astype(str) + df_1['title'].astype(str)
df_1 = df_1[['subreddit', 'text', 'target']]    

# Report >>>
print(f'Shape >>> {df_1.shape}')
print(f'Duplicates >>> {df_1.duplicated().sum()}')

Overwatch
To remove >>> 0
Shape >>> (2000, 3)
Duplicates >>> 52


### Dataset 2: league of legends

In [32]:
## Read in dataset 2 >>>

# To remove >>> 
dataset2_csv = './datasets/leagueoflegends_raw.csv'
dataset2 = pd.read_csv(dataset2_csv)
dataset2.drop(columns=['Unnamed: 0'], inplace=True)

# Dataframe >>>
df_2 = pd.DataFrame(dataset2)

# Add constants >>>
df_2 = df_2[['subreddit', 'selftext', 'title']]
df_2['target'] = 0

# test >>>
ttt = df_2.loc[(df_2['selftext'] == '[removed]') | (df_2['selftext'] == '[deleted]')]
print(f'To remove >>> {len(ttt)}')

# Report >>>
print(f'Shape >>> {df_2.shape}')
print(f'Duplicates >>> {df_2.duplicated().sum()}')

To remove >>> 407
Shape >>> (2000, 4)
Duplicates >>> 51


In [33]:
# Clean dataset 2 >>>
print(df_2['subreddit'][0])

# To remove >>>
df_2['selftext'] = df_2['selftext'].replace('[removed]', '')
df_2['selftext'] = df_2['selftext'].replace('[deleted]', '')
df_2['selftext'] = df_2['selftext'].fillna('')

# test >>>
ttt = df_2.loc[(df_2['selftext'] == '[removed]') | (df_2['selftext'] == '[deleted]')]
print(f'To remove >>> {len(ttt)}')

# Combine columns >>>
df_2['text'] = df_2['selftext'].astype(str) + df_2['title'].astype(str)
df_2 = df_2[['subreddit', 'text', 'target']]    

# Report >>>
print(f'Shape >>> {df_2.shape}')
print(f'Duplicates >>> {df_2.duplicated().sum()}')

leagueoflegends
To remove >>> 0
Shape >>> (2000, 3)
Duplicates >>> 82


In [34]:
# combine Dataframes >>>

df_final =[df_1, df_2]
final = pd.concat(df_final,axis=0,ignore_index=True)

In [35]:
final.head() 

Unnamed: 0,subreddit,text,target
0,Overwatch,Solo queue is bad for my mental health...,1
1,Overwatch,I’ve always wondered about Sym’s arm and have ...,1
2,Overwatch,"my 14 yo. sister in law painted this, and i th...",1
3,Overwatch,"my 14 yo. sister in law painted this picture, ...",1
4,Overwatch,that is allMy replays list is completely red,1


In [36]:
final.tail()

Unnamed: 0,subreddit,text,target
3995,leagueoflegends,Game started and everyone had an invisible cha...,0
3996,leagueoflegends,So I have to win 2 of 3 games to be promoted. ...,0
3997,leagueoflegends,I broke hail of blades,0
3998,leagueoflegends,I was thinking of what champs in LoL best repr...,0
3999,leagueoflegends,Steve' secret message revealed?,0


In [37]:
final.shape

(4000, 3)

In [38]:
final.to_csv('./datasets/final_clean.csv')

In [39]:
# remove duplicates