## {{cookiecutter.project_name}}

{{cookiecutter.description}}

### Data Sources
- file1 : Description of where this file came from

### Changes
- {% now 'utc', '%m-%d-%Y' %} : Started project

In [None]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import numpy as np

### File Locations

In [None]:
# Timestamp used to date-stamp the output file names.
today = datetime.today()

# Project data folders, resolved against the current working directory.
data_dir = Path.cwd() / "data"
processed_dir = data_dir / "processed"

# Raw input file, plus the pickle and Excel outputs for the cleaned data.
in_file = data_dir / "raw" / "FILE1"
summary_file = processed_dir / f"summary_{today:%b-%d-%Y}.pkl"
summary_file_excel = processed_dir / f"summary_{today:%b-%d-%Y}.xlsx"

In [None]:
# Load the raw input file into a DataFrame using read_csv's default settings
# (adjust sep/encoding/header here if FILE1 is not a plain comma-separated file).
df = pd.read_csv(in_file)

### Column Cleanup

- Remove all leading and trailing spaces from the column names.
- Drop the empty index column at the start of the file.
- Rename the columns for consistency.

In [None]:
# Trim leading and trailing whitespace from every column label.
# Reference: https://stackoverflow.com/questions/30763351/removing-space-in-dataframe-python
df.columns = [label.strip() for label in df.columns]

In [None]:
# Discard the first column in place (the empty index column).
df.drop(columns=df.columns[0], inplace=True)

In [None]:
# Mapping of current column name -> replacement name; add one entry per column.
cols_to_rename = {
    'col1': 'New_Name',
}
# Apply the mapping to the column labels in place.
df.rename(cols_to_rename, axis='columns', inplace=True)

### Clean Up Data Types

In [None]:
# Display each column's current dtype to spot columns that need converting.
df.dtypes

In [None]:
# Example: change a column's type (here PROD_CODE is cast to str).
# NOTE(review): on pandas versions before 3.0, astype(str) turns missing values
# into the literal text 'nan' - confirm PROD_CODE has no gaps, or handle them first.
df['PROD_CODE'] = df['PROD_CODE'].astype(str)

In [None]:
# Parse the DATE column into datetime values. dayfirst=True makes ambiguous
# dates such as 10/11/2012 read as day/month/year (10 November 2012).
df['DATE'] = pd.to_datetime(df['DATE'], dayfirst=True)

### Data Manipulation

In [None]:
# Example: collapse a one-row-per-product table into one row per consultant.
# `products` is expected to hold two columns, DISTRIBUTOR_NUMBER and PROD_CODE,
# with one row for each product a consultant has. PROD_CODE must already be
# text (see the astype(str) example above), because str.join rejects non-strings.
#
# Selecting PROD_CODE before aggregating keeps the grouping column out of the
# applied function (avoiding the pandas >= 2.2 deprecation warning raised by
# DataFrameGroupBy.apply), handles an empty frame cleanly, and leaves the
# combined codes in a column named PROD_CODE instead of an unnamed column `0`.
products = (
    products
    .groupby('DISTRIBUTOR_NUMBER')['PROD_CODE']
    .agg(', '.join)
    .reset_index()
)

In [None]:
# Example: recode a column based on a condition.
# Rows whose LANG is exactly 'Eesti' become 'EST'; every other row
# (including missing values) becomes 'RUS'.
is_estonian = df['LANG'] == 'Eesti'
df['LANG'] = np.where(is_estonian, 'EST', 'RUS')

In [None]:
# Example: merge many DataFrames into one.
# Each lookup table is left-joined onto `structure` in turn, matching on
# DISTRIBUTOR_NUMBER, so no row of `structure` is dropped along the way.
merged = structure
for lookup in (program_plus, wp1, wp2, qual_sponsored, grouped_products):
    merged = pd.merge(merged, lookup, on='DISTRIBUTOR_NUMBER', how='left')
df = merged

### Save output file into processed directory

Save the cleaned data to the processed directory so that it can be read back in later for further analysis.

Other options besides pickle include:
- feather
- msgpack (note: `to_msgpack` was removed in pandas 1.0, so this only applies to older pandas versions)
- parquet

In [None]:
# Write the cleaned DataFrame to the date-stamped pickle file.
# NOTE: pickle files should only be read back from sources you trust.
df.to_pickle(summary_file)

In [None]:
# Write the same DataFrame to the date-stamped Excel workbook.
# NOTE(review): to_excel also writes the row index as the first column by
# default - pass index=False if that column is not wanted; confirm with consumers.
df.to_excel(summary_file_excel)