In [27]:
import pandas as pd
import numpy as np

In [28]:
data = pd.read_csv('spending.csv', header = None)
data.head(10)

Unnamed: 0,0
0,2/11
1,68330 Feb rent
2,770 charge
3,68330 March rent
4,770 charge
5,880 lunch torisho
6,3580 plants rack amazon
7,7980 air purifier amazon
8,2/12
9,750 McDonald’s


In [29]:
data.columns = data.iloc[0]
data

Unnamed: 0,2/11
0,2/11
1,68330 Feb rent
2,770 charge
3,68330 March rent
4,770 charge
...,...
598,"4000, social dinner, tres hermanos"
599,"600, social snacks, what the dickens"
600,"700, taxi, to Ebisu"
601,7/27


In [30]:
data.rename(columns={'2/11': 'item'}, inplace=True)
data

Unnamed: 0,item
0,2/11
1,68330 Feb rent
2,770 charge
3,68330 March rent
4,770 charge
...,...
598,"4000, social dinner, tres hermanos"
599,"600, social snacks, what the dickens"
600,"700, taxi, to Ebisu"
601,7/27


In [31]:
# check data

print(data.shape)
print(data.isna().sum())

(603, 1)
0
item    0
dtype: int64


In [32]:
# create date column 

def date_column(date_str):
    if date_str.replace('/', '').isdigit() == True:
        return date_str
    
data['Date'] = data['item'].apply(date_column)
data

Unnamed: 0,item,Date
0,2/11,2/11
1,68330 Feb rent,
2,770 charge,
3,68330 March rent,
4,770 charge,
...,...,...
598,"4000, social dinner, tres hermanos",
599,"600, social snacks, what the dickens",
600,"700, taxi, to Ebisu",
601,7/27,7/27


In [23]:
# function to fill in the dates for the Nones. forward fill!
data['Date'] = data['Date'].ffill()
data.head(20)

Unnamed: 0,item,Date
0,2/11,2/11
1,68330 Feb rent,2/11
2,770 charge,2/11
3,68330 March rent,2/11
4,770 charge,2/11
5,880 lunch torisho,2/11
6,3580 plants rack amazon,2/11
7,7980 air purifier amazon,2/11
8,2/12,2/12
9,750 McDonald’s,2/12


In [35]:
# how many rows have just the date and no items:

(data['item'] == data['Date']).sum()

167

In [24]:
# clear out the date-only rows
# boolean mask
data = data[data['item'] != data['Date']]
data.shape

(436, 2)

In [231]:
# convert the dates to datetime objects

from datetime import datetime

def add_year(date_str):
    return datetime.strptime(f'{date_str}/2024', '%m/%d/%Y')

data['Date'] = data['Date'].apply(add_year)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Date'] = data['Date'].apply(add_year)


Unnamed: 0,item,Date
1,68330 Feb rent,2024-02-11
2,770 charge,2024-02-11
3,68330 March rent,2024-02-11
4,770 charge,2024-02-11
5,880 lunch torisho,2024-02-11
...,...,...
596,"2595, groceries, gyomu super",2024-07-25
598,"4000, social dinner, tres hermanos",2024-07-26
599,"600, social snacks, what the dickens",2024-07-26
600,"700, taxi, to Ebisu",2024-07-26


### next: parse out amount, item, location

In [217]:
def split_item(item):
    if ',' in item:
        return item.split(',')
    else: 
        return item.split()

data['item'] = data['item'].apply(split_item)    
data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['item'] = data['item'].apply(split_item)


Unnamed: 0,item,Date
1,"[68330, Feb, rent]",2024-02-11
2,"[770, charge]",2024-02-11
3,"[68330, March, rent]",2024-02-11
4,"[770, charge]",2024-02-11
5,"[880, lunch, torisho]",2024-02-11
...,...,...
596,"[2595, groceries, gyomu super]",2024-07-25
598,"[4000, social dinner, tres hermanos]",2024-07-26
599,"[600, social snacks, what the dickens]",2024-07-26
600,"[700, taxi, to Ebisu]",2024-07-26


In [219]:
# parse into columns
def separate_columns(parts):
    parts = [part.strip() for part in parts]
    if len(parts) == 1:
        return 0, parts[0], None
    elif len(parts) == 2:
        return parts[0], parts[1], None, None
    elif len(parts) == 3:
        return parts[0], parts[1], parts[2], None
    elif len(parts) == 4:
        return parts[0], parts[1], parts[2], parts[3]

parsed_columns = data['item'].apply(separate_columns)
print(parsed_columns)
print(parsed_columns.tolist())
parsed_df = pd.DataFrame(parsed_columns.tolist(), columns=['Price', 'Item', 'Location', 'Details'])
df = pd.concat([data, parsed_df], axis=1)
parsed_df.head(50)

1                          (68330, Feb, rent, None)
2                         (770, charge, None, None)
3                        (68330, March, rent, None)
4                         (770, charge, None, None)
5                       (880, lunch, torisho, None)
                           ...                     
596            (2595, groceries, gyomu super, None)
598      (4000, social dinner, tres hermanos, None)
599    (600, social snacks, what the dickens, None)
600                     (700, taxi, to Ebisu, None)
602                             (0, Nothing!, None)
Name: item, Length: 436, dtype: object
[('68330', 'Feb', 'rent', None), ('770', 'charge', None, None), ('68330', 'March', 'rent', None), ('770', 'charge', None, None), ('880', 'lunch', 'torisho', None), ('3580', 'plants', 'rack', 'amazon'), ('7980', 'air', 'purifier', 'amazon'), ('750', 'McDonald’s', None, None), ('1814', 'groceries', 'top', None), ('283', 'snacks', 'famima', None), ('1900', 'shoes', 'workman', None), ('540'

Unnamed: 0,Price,Item,Location,Details
0,68330.0,Feb,rent,
1,770.0,charge,,
2,68330.0,March,rent,
3,770.0,charge,,
4,880.0,lunch,torisho,
5,3580.0,plants,rack,amazon
6,7980.0,air,purifier,amazon
7,750.0,McDonald’s,,
8,1814.0,groceries,top,
9,283.0,snacks,famima,


In [220]:
df

Unnamed: 0,item,Date,Price,Item,Location,Details
1,"[68330, Feb, rent]",2024-02-11,770,charge,,
2,"[770, charge]",2024-02-11,68330,March,rent,
3,"[68330, March, rent]",2024-02-11,770,charge,,
4,"[770, charge]",2024-02-11,880,lunch,torisho,
5,"[880, lunch, torisho]",2024-02-11,3580,plants,rack,amazon
...,...,...,...,...,...,...
419,,NaT,2720,doc,Tama clinic,
424,,NaT,795,snacks,welpark,
429,,NaT,500,lunch,summit,
433,,NaT,600,social snacks,what the dickens,


In [221]:
parsed_df

Unnamed: 0,Price,Item,Location,Details
0,68330,Feb,rent,
1,770,charge,,
2,68330,March,rent,
3,770,charge,,
4,880,lunch,torisho,
...,...,...,...,...
431,2595,groceries,gyomu super,
432,4000,social dinner,tres hermanos,
433,600,social snacks,what the dickens,
434,700,taxi,to Ebisu,


## buncha experimentation

In [222]:
# data['Date']

def find_date(column):
    for i in range(len(column)):
        if column[i].replace('/', '').isdigit() == True:
            date = column[i]

In [223]:
def insert_date(column, ):
    while row.replace('/', '').isdigit() == False:
        

SyntaxError: incomplete input (4087099836.py, line 3)

In [None]:
'2/12'.isdigit()



In [None]:
'2/12'.replace('/', '')