In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Monthly trip files to clean, identified by their YYYYMM prefix.
dates = ["202204", "202205", "202206", "202207", "202208", "202209",
         "202210", "202211", "202212", "202301", "202302", "202303"]

# For every month: load the raw csv, drop incomplete and duplicate rows,
# parse the timestamps, derive "ride_length" and "day_of_week", drop rides
# whose length is not strictly positive, and save the result as
# "cleaned_<YYYYMM>.csv". Sanity-check summaries are printed along the way.
for date in dates:
    print(date + " ###############################################")

    # Import the csv data for this month and display some info.
    path = date + "-divvy-tripdata.csv"
    data = pd.read_csv(path)
    print(data.head(), "\n")
    data.info()

    # Eliminate the rows that contain at least one missing value.
    # Reassigning instead of using inplace=True keeps every step an explicit
    # new frame.
    print("\n Eliminates the empty rows")
    data = data.dropna()

    # Eliminate the duplicate rows.
    print("Eliminates duplicate rows")
    data = data.drop_duplicates()

    # Turn the entries of the columns "started_at" and "ended_at" into
    # datetime format. `assign` returns a new frame, so no column is set on a
    # frame that was produced by filtering.
    print("""Turns the entries of the columns "started_at" and "ended_at" into datetime format
    """)
    data = data.assign(
        started_at=pd.to_datetime(data["started_at"]),
        ended_at=pd.to_datetime(data["ended_at"]),
    )

    # Show all the distinct values of the categorical columns so they can be
    # checked for validity.
    print("""values of the columns "rideable_type", "member_casual" """)
    for column in ["rideable_type", "member_casual"]:
        entries = data.groupby(column).groups.keys()
        print(column, " : ", entries)
        print(" ")

    # Show the max and min of every column, so the boundaries of the data
    # can be checked.
    print("Shows the MAX and MIN of all the data")
    data_max = pd.DataFrame(
        [data.max(), data.min()],
        index=["Max", "Min"],
    ).transpose()
    print(data_max)

    # Add the column "ride_length" as "ended_at" minus "started_at".
    print("\n Add the column ride_length")
    data = data.assign(ride_length=data["ended_at"] - data["started_at"])

    # Add the column "day_of_week", with 0 as Monday and 6 as Sunday.
    print("Add the column day_of_week")
    data = data.assign(day_of_week=data["started_at"].dt.dayofweek)

    # Keep only the rides with a strictly positive length; rows with a
    # negative or zero ride_length are removed.
    print("Removes the rows with negative ride length ")
    zero_time = pd.Timedelta("0 days 00:00:00")
    data = data[data.ride_length > zero_time]

    # Show max and min of the new columns, so their boundaries can be checked.
    print("Shows max and min for the new columns")
    new_columns = data[["ride_length", "day_of_week"]]
    print(pd.DataFrame(
        [new_columns.max(), new_columns.min()],
        index=["Max", "Min"],
    ).transpose())

    # Save the cleaned data in a csv file. The DataFrame index is written as
    # the first (unnamed) column, since to_csv is called with its defaults.
    print("\n Saves the cleaned data in a csv file")
    data.to_csv("cleaned_" + date + ".csv")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None").
    data.info()

202204 ###############################################
            ride_id  rideable_type           started_at             ended_at  \
0  3564070EEFD12711  electric_bike  2022-04-06 17:42:48  2022-04-06 17:54:36   
1  0B820C7FCF22F489   classic_bike  2022-04-24 19:23:07  2022-04-24 19:43:17   
2  89EEEE32293F07FF   classic_bike  2022-04-20 19:29:08  2022-04-20 19:35:16   
3  84D4751AEB31888D   classic_bike  2022-04-22 21:14:06  2022-04-22 21:23:29   
4  5664BCF0D1DE7A8B  electric_bike  2022-04-16 15:56:30  2022-04-16 16:02:11   

          start_station_name start_station_id          end_station_name  \
0     Paulina St & Howard St              515   University Library (NU)   
1  Wentworth Ave & Cermak Rd            13075     Green St & Madison St   
2       Halsted St & Polk St     TA1307000121     Green St & Madison St   
3  Wentworth Ave & Cermak Rd            13075  Delano Ct & Roosevelt Rd   
4       Halsted St & Polk St     TA1307000121   Clinton St & Madison St   

  end_station

In [22]:
# Range check for the two derived columns: one row per column, with its
# largest and smallest value side by side.
print("\n Shows max and min for the new columns")
derived_columns = data[["ride_length", "day_of_week"]]
extremes = [derived_columns.max(), derived_columns.min()]
boundaries = pd.DataFrame(extremes, index=["Max", "Min"]).T
boundaries


 Shows max and min for the new columns


Unnamed: 0,Max,Min
ride_length,1 days 00:53:14,-1 days +23:02:01
day_of_week,6,0


In [32]:
# Keep only the rides whose length is strictly positive.
# The filter is applied unconditionally: when no offending row exists it is a
# no-op, and `right_data` is always defined from the current `data` instead of
# being left undefined (or stale from a previous run) when the minimum ride
# length happens not to be negative.
zero_time = pd.Timedelta("0 days 00:00:00")
right_data = data[data.ride_length > zero_time]
right_data

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week
0,BCC66FC6FAB27CC7,electric_bike,2022-11-10 06:21:55,2022-11-10 06:31:27,Canal St & Adams St,13011,St. Clair St & Erie St,13016,41.879401,-87.639848,41.894345,-87.622798,member,0 days 00:09:32,3
1,772AB67E902C180F,classic_bike,2022-11-04 07:31:55,2022-11-04 07:46:25,Canal St & Adams St,13011,St. Clair St & Erie St,13016,41.879255,-87.639904,41.894345,-87.622798,member,0 days 00:14:30,4
2,585EAD07FDEC0152,classic_bike,2022-11-21 17:20:29,2022-11-21 17:34:36,Indiana Ave & Roosevelt Rd,SL-005,St. Clair St & Erie St,13016,41.867888,-87.623041,41.894345,-87.622798,member,0 days 00:14:07,0
3,91C4E7ED3C262FF9,classic_bike,2022-11-25 17:29:34,2022-11-25 17:45:15,Indiana Ave & Roosevelt Rd,SL-005,St. Clair St & Erie St,13016,41.867888,-87.623041,41.894345,-87.622798,member,0 days 00:15:41,4
4,709206A3104CABC8,classic_bike,2022-11-29 17:24:25,2022-11-29 17:42:51,Indiana Ave & Roosevelt Rd,SL-005,St. Clair St & Erie St,13016,41.867888,-87.623041,41.894345,-87.622798,member,0 days 00:18:26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337727,03005DDB0F2F28E4,classic_bike,2022-11-09 14:22:29,2022-11-09 14:25:18,Clifton Ave & Armitage Ave,TA1307000163,Sheffield Ave & Webster Ave,TA1309000033,41.918216,-87.656936,41.921540,-87.653818,member,0 days 00:02:49,2
337731,B0B4E85DA43A9194,classic_bike,2022-11-22 16:57:53,2022-11-22 17:31:29,Franklin St & Jackson Blvd,TA1305000025,Sheffield Ave & Wrightwood Ave,TA1309000023,41.877708,-87.635321,41.928712,-87.653833,casual,0 days 00:33:36,1
337732,8D148DD47B59530B,classic_bike,2022-11-06 13:04:05,2022-11-06 13:13:33,Michigan Ave & Ida B Wells Dr,TA1305000010,Shedd Aquarium,15544,41.876243,-87.624426,41.867226,-87.615355,casual,0 days 00:09:28,6
337733,0D1170BA18FD33D1,classic_bike,2022-11-06 09:41:29,2022-11-06 15:17:17,Halsted St & 18th St,13099,Shedd Aquarium,15544,41.857506,-87.645991,41.867226,-87.615355,casual,0 days 05:35:48,6
