#                                                            Cleaning Datasets


# Import Libraries & Tools

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression 
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import Dataframe using Pandas

In [3]:
ask_trans = pd.read_csv('/Users/khalildavis/Desktop/General Assembly Work/Submissions/Projects/project-3-master/Project 3 Submission/Data/asktrans_clean.csv')
ask_trans.head(5)

Unnamed: 0,title,auth,subreddit,text
0,What's the difference between transitioning wo...,SnooChocolates8273,asktransgender,When I look at a beautiful woman I see a beaut...
1,Did you also have this?,HoldTheStocks2,asktransgender,When I was like 10-14 I was searching all over...
2,How many times a week do you dilatate?,Kind_Lemon,asktransgender,"Hello!\n\nI am seven years post-surgical MtF, ..."
3,Coming out to my mother (again),mwnahas,asktransgender,Ive already done it about 1.5 years ago. She w...
4,Foot hurts is it hrt,Anastasia69Sanchez,asktransgender,I heard foot gets smaller . Idk if I'm getting...


# Dropping Columns & Duplicates

In [5]:
ask_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6289 entries, 0 to 6288
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      6289 non-null   object
 1   auth       6289 non-null   object
 2   subreddit  6289 non-null   object
 3   text       6289 non-null   object
dtypes: object(4)
memory usage: 196.7+ KB


In [None]:
#Code for removing uncessary columns

ask_trans.drop(columns=['Unnamed: 0', 'time'], inplace=True)

In [7]:
#Checked to see if thr columns were properly dropped

ask_trans.head(2)

Unnamed: 0,title,auth,subreddit,text
0,What's the difference between transitioning wo...,SnooChocolates8273,asktransgender,When I look at a beautiful woman I see a beaut...
1,Did you also have this?,HoldTheStocks2,asktransgender,When I was like 10-14 I was searching all over...


In [8]:
#Code for checking for duplicates with the df

ask_trans.duplicated().sum()

0

In [9]:
#Code to drop the duplicates

ask_trans.drop_duplicates(inplace=True)

In [10]:
#This shows me the duplicates were succesfully removed

ask_trans.duplicated().sum()

0

In [11]:
ask_trans.isna().sum()

title        0
auth         0
subreddit    0
text         0
dtype: int64

**Observation**:
I have decided to drop all the rows throughout the dataframe that contain 'NaN' & '[removed]' because compared to whole dataframe, the 'NaN', '[removed]' values are 6% ,3% respectively of the data which is a very low number.

In [12]:
#Code to removed any 'Nan" values
ask_trans.dropna(inplace=True)

In [13]:
#Code to see how many removed values I have in the df

ask_trans[ask_trans['text'] == '[removed]'].value_counts().sum()

0

In [14]:
#Code to to remove any rows that contain '[removed]'

ask_trans = ask_trans[ask_trans['text'] != '[removed]']

In [15]:
#Code to check the removed is deleted from df

ask_trans[ask_trans['text'] == '[removed]'].value_counts().sum()

0

In [16]:
#Checking to see how many text say '[deleted]'

ask_trans[ask_trans['text'] == '[deleted]'].value_counts().sum()

0

In [17]:
#Code to remove the '[deleted]'

ask_trans = ask_trans[ask_trans['text'] != '[deleted]']

In [18]:
#Check to see if it has been removed

ask_trans[ask_trans['text'] == '[deleted]'].value_counts().sum()

0

In [19]:
#This is the code to remove emojis within the df

ask_trans = ask_trans.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

In [20]:
ask_trans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6289 entries, 0 to 6288
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      6289 non-null   object
 1   auth       6289 non-null   object
 2   subreddit  6289 non-null   object
 3   text       6289 non-null   object
dtypes: object(4)
memory usage: 245.7+ KB


**Observation**:
Since I removed the whole 'NaN' & '[removed]' out of my data, I have reduced the rows by which still gives me 6,298 rows to work with. As well removing any emojis within the dataframe.

In [21]:
ask_trans.to_csv('asktrans_clean.csv', index=False)

# Cleaning Rainbow Dataframe

**Observation**:
This cleaning process will be slightly different as there are emojis within the cells and I plan to removed them since they are characters and not words.

In [24]:
rainbow = pd.read_csv('/Users/khalildavis/Desktop/General Assembly Work/Submissions/Projects/project-3-master/Project 3 Submission/Data/rainbow_clean.csv')
rainbow.head(5)

Unnamed: 0,title,auth,subreddit,text
0,I created an Omegle Clone that actually helps ...,lolroofus,ainbow,
1,My boyfriend came out to me as bi,glassAdvertiser,ainbow,"Hi everyone, I really need some advice! \n\nMy..."
2,A celebration of a queen,Max_E_Mas,ainbow,If you are not up to date on Jeopardy I sugges...
3,"Thank you, old queer people",Wonderful_Toes,ainbow,Thank you for the chance you've given so many ...
4,Did any of you actively avoid LGBTQ things/peo...,ivegotyoupegged,ainbow,I've had this issue for the better part of a y...


In [None]:
#Checking to see if there are any missing values or any uncessary columns to remove

rainbow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990 entries, 0 to 6989
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6990 non-null   int64 
 1   title       6990 non-null   object
 2   auth        6990 non-null   object
 3   time        6990 non-null   int64 
 4   subreddit   6990 non-null   object
 5   text        2992 non-null   object
dtypes: int64(2), object(4)
memory usage: 327.8+ KB


In [None]:
#Using the code to drop the 'Unnamed: 0' & 'time' columns

rainbow.drop(columns=['Unnamed: 0', 'time'], inplace=True)

In [None]:
#Check to see the columns were properly dropped
rainbow.head(2)

Unnamed: 0,title,auth,subreddit,text
0,I created an Omegle Clone that actually helps ...,lolroofus,ainbow,
1,My boyfriend came out to me as bi,glassAdvertiser,ainbow,"Hi everyone, I really need some advice! \n\nMy..."


In [None]:
rainbow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990 entries, 0 to 6989
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      6990 non-null   object
 1   auth       6990 non-null   object
 2   subreddit  6990 non-null   object
 3   text       2992 non-null   object
dtypes: object(4)
memory usage: 218.6+ KB


In [None]:
#Code for checking to see if there are duplicates

rainbow.duplicated().sum()

130

**Observation:**
There are 130 duplicates that need to be removed from the dataframe. The duplicates only account for 1.9% of my data. Removing the duplicates will not harm my data. I still have 6,860 to work with.

In [None]:
#Code to removed the duplicates from the dataframe

rainbow.drop_duplicates(inplace=True)

In [None]:
#Checked to see if the duplicates were properly dropped

rainbow.shape

(6860, 4)

In [None]:
#The duplicates have been removed

rainbow.duplicated().sum()

0

In [None]:
rainbow[rainbow['text'] == '[removed]'].value_counts().sum()

757

**Observation**:
Seeing that the 'text' column has 2,992 non-null cells, the percentage of 43% states to me that this a problem I must correct. Then I noticed that there are 757 ['removed'] which accounts for 25% of the 43% of the non-null cells. 25% is the word '[removed]' and 18% is 'Nan'.

Instead of pulling more data, I have decided to fill the rows with an empty string and delete any rows that contain the word '[removed]'.  

In [None]:
# Filling the 'Nan' values
rainbow['text'].fillna(' ', inplace=True)

In [None]:
rainbow.shape

(6860, 4)

In [None]:
#Check to see if there are any '[deleted]' within the df

rainbow[rainbow['text'] == '[deleted]'].value_counts().sum()

74

In [None]:
#To remove the '[deleted]'
rainbow = rainbow[rainbow['text'] != '[deleted]']

**Observation**:
This action lets me know that I was successful in filling the rows that had 'Nan in them although there are still words such as "[removed]" that need to be taken out. 

In [None]:
# The code to remove the '[removed]' in text column

rainbow = rainbow[rainbow['text'] != '[removed]']

In [None]:
#Checked to see if the '[removed]' has been deleted

rainbow[rainbow['text'] == '[removed]'].value_counts().sum()

0

In [None]:

rainbow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6029 entries, 0 to 6989
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      6029 non-null   object
 1   auth       6029 non-null   object
 2   subreddit  6029 non-null   object
 3   text       6029 non-null   object
dtypes: object(4)
memory usage: 235.5+ KB


**Observation:**
I have noticed that this dataframe contains a heavy amount of emojis especially within the 'title' &  'text' columns. I have found the code to remove them with the help of 'StackOverflow' and save it to the rainbow_clean dataframe.

In [None]:
#This code came from StackOverflow to help remove emojis

rainbow = rainbow.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

In [None]:
rainbow.to_csv('rainbow_clean.csv', index=False)

# Merge Clean Dataframes together

In [None]:
combo_clean = pd.concat([ask_trans, rainbow])
combo_clean.head()

Unnamed: 0,title,auth,subreddit,text
1,What's the difference between transitioning wo...,SnooChocolates8273,asktransgender,When I look at a beautiful woman I see a beaut...
2,Did you also have this?,HoldTheStocks2,asktransgender,When I was like 10-14 I was searching all over...
3,How many times a week do you dilatate?,Kind_Lemon,asktransgender,"Hello!\n\nI am seven years post-surgical MtF, ..."
4,Coming out to my mother (again),mwnahas,asktransgender,Ive already done it about 1.5 years ago. She w...
5,Foot hurts is it hrt,Anastasia69Sanchez,asktransgender,I heard foot gets smaller . Idk if I'm getting...


In [None]:
combo_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12318 entries, 1 to 6989
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      12318 non-null  object
 1   auth       12318 non-null  object
 2   subreddit  12318 non-null  object
 3   text       12318 non-null  object
dtypes: object(4)
memory usage: 481.2+ KB


In [None]:
combo_clean.to_csv('combo_clean.csv', index=False)

In [None]:
combo_clean['is_asktransgender'] = [1 if i == 'asktransgender' else 0 for i in combo_clean['subreddit']]
combo_clean.head()

Unnamed: 0,title,auth,subreddit,text,is_asktransgender
1,What's the difference between transitioning wo...,SnooChocolates8273,asktransgender,When I look at a beautiful woman I see a beaut...,1
2,Did you also have this?,HoldTheStocks2,asktransgender,When I was like 10-14 I was searching all over...,1
3,How many times a week do you dilatate?,Kind_Lemon,asktransgender,"Hello!\n\nI am seven years post-surgical MtF, ...",1
4,Coming out to my mother (again),mwnahas,asktransgender,Ive already done it about 1.5 years ago. She w...,1
5,Foot hurts is it hrt,Anastasia69Sanchez,asktransgender,I heard foot gets smaller . Idk if I'm getting...,1


In [None]:
combo_clean.to_csv('combo_clean_binary.csv', index=False)

In [None]:
combo_clean.isna().sum()

title                0
auth                 0
subreddit            0
text                 0
is_asktransgender    0
dtype: int64

# Data Dictionary

|Feature|Type|Dataset|Description|
|---|---|---|---|
|r/asktransgender|object|rainbow_clean df|Subreddit that focuses on transgender advice|
|r/ainbow| object| ask_trans_clean df | Subreddit that focuses on lgbt advice
|combo_clean|object|combo_clean df|Combining the two subreddits to make a full df