# CSV FILE CLEAN & ADD FEATURES

## Import Libraries

In [1]:
import pandas as pd

## CSV File Clean & Add Features
- Puts each record on a single row and removes empty lines (line breaks inside the text columns are replaced with spaces)
- Removes non-alphanumeric characters from the columns 'question_content' & 'response_content' leaving only spaces between the words
- Creates 'year', 'month' & 'day' columns from the 'question_sent' column

In [2]:
# The csv file is large, so it is read lazily in chunks instead of all at once.
raw_csv_path = '../data/b0cd514b-b9cc-4972-a0c2-c91726e6d825.csv'

# Reading both gender columns as strings removes the mixed dtypes error message.
gender_dtypes = dict(question_user_gender=str, response_user_gender=str)
rows_per_chunk = 100000

chunks = pd.read_csv(raw_csv_path, dtype=gender_dtypes, chunksize=rows_per_chunk)

# Start from an empty dataframe; the cleaned chunks are combined into `df` further down.
df = pd.DataFrame()

In [3]:
# Clean the text in the columns 'question_content' & 'response_content',
# create the 'year', 'month' & 'day' features from 'question_sent',
# then combine all cleaned chunks into a single dataframe.
content_columns = ['question_content', 'response_content']

# Chunks are collected in a list and concatenated once after the loop.
# Calling pd.concat inside the loop re-copies every row gathered so far on each
# iteration, which is quadratic in the number of chunks.
cleaned_chunks = []
for chunk in chunks:
    for column in content_columns:
        # Keep only alphanumeric characters with a single space between words:
        # every run of other characters (punctuation, line breaks, repeated
        # spaces, ...) collapses to one space. This single pattern gives the same
        # result as replacing r'[^a-zA-Z0-9 ]+' and then r'\s+' with a space.
        # strip() then removes the space left at the beginning and end of the string.
        chunk[column] = (
            chunk[column]
            .str.replace(r'[^a-zA-Z0-9]+', ' ', regex=True)
            .str.strip()
        )

    # The first 19 characters of 'question_sent' are parsed as the timestamp
    # (any trailing fraction / offset is ignored). Each value is parsed once and
    # reused for both name features; pd.to_datetime only guarantees a datetime
    # dtype so the .dt accessor is always available.
    sent_at = pd.to_datetime(
        chunk['question_sent'].map(lambda value: pd.Timestamp(value[:19]))
    )

    # Year is taken from the first four characters of the raw string.
    chunk['year'] = chunk['question_sent'].str[:4].astype(int)
    chunk['month'] = sent_at.dt.month_name()
    chunk['day'] = sent_at.dt.day_name()

    cleaned_chunks.append(chunk)

# If the reader yielded nothing (e.g. this cell is re-run after the chunk
# iterator has been consumed), `df` is left as it was.
if cleaned_chunks:
    df = pd.concat(cleaned_chunks, axis=0)

## Export To CSV File

In [4]:
# Write the combined dataframe to disk without the row index.
cleaned_csv_path = '../data/cleaned_updated_data.csv'
df.to_csv(cleaned_csv_path, index=False)