In [1]:
import pandas as pd
import numpy as np

## » Reading from Data Structures

### » from dict

In [2]:
# Two equally long ranges, keyed by the column name each one will become.
d = dict(col1=range(5), col2=range(5, 10))

type(d)

dict

In [3]:
# Build a frame from the dict `d`, spelling out each optional constructor argument
# (the next cell shows how to set a dtype per column instead).
df = pd.DataFrame(
    d,                              # the data; writing `data=` explicitly is optional
    index=list('ABCDE'),            # row labels; when omitted a RangeIndex starting at 0 is used
    columns=['col1', 'col2'],       # column labels; for a dict the keys of the key:value pairs become the columns
    dtype=np.int64,                 # the constructor only accepts one dtype for the whole table
    copy=True,
)

print(df)

   col1  col2
A     0     5
B     1     6
C     2     7
D     3     8
E     4     9


In [4]:
# Alternatively, dtypes can be converted per column after creation with astype().
# astype() returns a NEW DataFrame and leaves `df` untouched, so the result has
# to be bound to a name -- calling it bare silently throws the conversion away.
df_cast = df.astype({'col1': 'int64', 'col2': 'string'})

print(df.dtypes, "\n")   # the original frame is unchanged: both columns are still int64

# Or chain the conversion straight onto the DataFrame construction.
df_dtypes = pd.DataFrame(d).astype({'col1': 'int64', 'col2': 'string'})

print(df_dtypes.dtypes)

col1    int64
col2    int64
dtype: object 

col1             int64
col2    string[python]
dtype: object


#### »» dict - to rows

In [5]:
# from_dict() gives more control over how you insert your dataset

In [6]:
# A "tight" dict carries the row labels, the column labels, the values and the
# level names of both axes; tuple labels become MultiIndex levels.
data = dict(
    index=[('a', 'b'), ('a', 'c')],
    columns=[('x', 1), ('y', 2)],
    data=[[1, 3], [2, 4]],
    index_names=['n1', 'n2'],
    column_names=['z1', 'z2'],
)

print(pd.DataFrame.from_dict(data, orient='tight'))

z1     x  y
z2     1  2
n1 n2      
a  b   1  3
   c   2  4


In [7]:
# orient='index' treats every dict key as a row label instead of a column name.
df_rows = pd.DataFrame.from_dict(d, orient='index')

print(df, "\n")      # the column-oriented frame, for comparison
print(df_rows)

   col1  col2
A     0     5
B     1     6
C     2     7
D     3     8
E     4     9 

      0  1  2  3  4
col1  0  1  2  3  4
col2  5  6  7  8  9


#### »» tight dict - Multi-Index df

In [8]:
# Tight-format dict: two-level row labels (group, type) and two-level column
# labels (Category, Value) around a 4 x 6 grid holding the integers 0..23.
tightDict = dict(
    index=[
        ('Vector', 'type-A'), ('Vector', 'type-B'),
        ('Square', 'type-C'), ('Square', 'type-D'),
    ],
    columns=[(category, symbol) for category in ('A', 'B', 'C') for symbol in ('$', '#')],
    data=[list(range(start, start + 6)) for start in range(0, 24, 6)],
    index_names=['group', 'type'],
    column_names=['Category', 'Value'],
)

In [9]:
# Direct approach: hand the tight dict to from_dict() without an orient.
# The failure is the point of this cell, so the error is caught and shown.
try:
    dfRaw = pd.DataFrame.from_dict(tightDict)
except Exception as err:
    print(err)

All arrays must be of the same length


In [10]:
# A plain column-oriented dict needs no orient: every key becomes a column.
dictNew = dict(abs=[1, 2, 3], sec=[3, 4, 5])

dfTry = pd.DataFrame(dictNew)   # works for a simple dict, but not for a tight dict

print(dfTry)

   abs  sec
0    1    3
1    2    4
2    3    5


In [11]:
# Hence orient='tight': it tells from_dict() to read the index / columns / data layout.
# If the 'index', 'columns' or 'data' key is missing, an error is raised.
# NOTE(review): the original note called 'index_names' / 'column_names' optional --
# confirm against the pandas version in use before relying on that.
dfTight = pd.DataFrame.from_dict(tightDict, orient='tight')

print(dfTight)

Category        A       B       C    
Value           $   #   $   #   $   #
group  type                          
Vector type-A   0   1   2   3   4   5
       type-B   6   7   8   9  10  11
Square type-C  12  13  14  15  16  17
       type-D  18  19  20  21  22  23


In [12]:
tightDictNew = dfTight.to_dict(orient = 'tight') # round-trip: export the MultiIndex frame back into a tight-format dict

tightDictNew

{'index': [('Vector', 'type-A'),
  ('Vector', 'type-B'),
  ('Square', 'type-C'),
  ('Square', 'type-D')],
 'columns': [('A', '$'),
  ('A', '#'),
  ('B', '$'),
  ('B', '#'),
  ('C', '$'),
  ('C', '#')],
 'data': [[0, 1, 2, 3, 4, 5],
  [6, 7, 8, 9, 10, 11],
  [12, 13, 14, 15, 16, 17],
  [18, 19, 20, 21, 22, 23]],
 'index_names': ['group', 'type'],
 'column_names': ['Category', 'Value']}

### » from lists

In [13]:
# Parallel lists: position i in each list describes the same person.
names = ['krill', 'sean', 'megh', 'luke', 'brian']
height = [155, 180, 170, 175, 178]
weight = [55, 70, 80, 82, 98]

#### »» list to columns - default approach

In [14]:
# A single list becomes one column (labelled 0 by default); `names` labels the rows.
classHeight = pd.DataFrame(height, index=names)

print(classHeight)

         0
krill  155
sean   180
megh   170
luke   175
brian  178


In [15]:
# Two lists are combined by wrapping them in a dict: each key names one column.
classPopln = pd.DataFrame(dict(height=height, weight=weight), index=names)

print(classPopln)
print(classPopln.dtypes)

       height  weight
krill     155      55
sean      180      70
megh      170      80
luke      175      82
brian     178      98
height    int64
weight    int64
dtype: object


#### »» using zip()

In [16]:
# zip() pairs the lists element-wise, giving one (height, weight) tuple per row.
listRows = [pair for pair in zip(height, weight)]
print(listRows, "\n")

# Each tuple is one row; no column names are given, so the columns are labelled 0 and 1.
df_zip = pd.DataFrame(listRows, index=names)

print(df_zip)

[(155, 55), (180, 70), (170, 80), (175, 82), (178, 98)] 

         0   1
krill  155  55
sean   180  70
megh   170  80
luke   175  82
brian  178  98


#### »» list of lists - rows data into dataFrame

In [17]:
# A list of lists is read row by row: every inner list is one record.
classPII = [
    ["saran", 1992, 'A'],
    ["karan", 1994, 'B'],
    ["maran", 2001, 'C'],
]

classPD = pd.DataFrame(
    data=classPII,
    index=["Pavuluri", "Singh", "Dayan"],   # one row label per record
    columns=["names", "dob", "id"],         # one column label per position in a record
)

print(classPD)

          names   dob id
Pavuluri  saran  1992  A
Singh     karan  1994  B
Dayan     maran  2001  C


#### »» Multi-Index

In [18]:
# Column label pairs: every category (A, B, C) crossed with both value symbols ($, #).
colIndexTuple = [(category, symbol) for category in 'ABC' for symbol in '$#']

colIndexTuple

[('A', '$'), ('A', '#'), ('B', '$'), ('B', '#'), ('C', '$'), ('C', '#')]

In [19]:
# pd.MultiIndex.from_tuples() converts the list of tuples into a two-level index
# whose levels are named Category and value.
colMultiIndex = pd.MultiIndex.from_tuples(colIndexTuple, names=['Category', 'value'])

colMultiIndex

MultiIndex([('A', '$'),
            ('A', '#'),
            ('B', '$'),
            ('B', '#'),
            ('C', '$'),
            ('C', '#')],
           names=['Category', 'value'])

In [20]:
# Four consecutive runs of six integers (0..23), one run per row of the frame.
first, second, third, fourth = (list(range(lo, lo + 6)) for lo in (0, 6, 12, 18))

mulData = [first, second, third, fourth]

mulData

[[0, 1, 2, 3, 4, 5],
 [6, 7, 8, 9, 10, 11],
 [12, 13, 14, 15, 16, 17],
 [18, 19, 20, 21, 22, 23]]

In [21]:
# Only the columns get a MultiIndex here; the rows keep the default RangeIndex.
dfMulti = pd.DataFrame(data=mulData, columns=colMultiIndex)

print(dfMulti)

Category   A       B       C    
value      $   #   $   #   $   #
0          0   1   2   3   4   5
1          6   7   8   9  10  11
2         12  13  14  15  16  17
3         18  19  20  21  22  23


In [22]:
# Next, multi-indexing for the rows as well: one (group, type) pair per row.
rowsTuple = list(zip(
    ['Vector', 'Vector', 'Square', 'Square'],
    ['type-A', 'type-B', 'type-C', 'type-D'],
))

rowsTuple

[('Vector', 'type-A'),
 ('Vector', 'type-B'),
 ('Square', 'type-C'),
 ('Square', 'type-D')]

In [23]:
# Turn the row tuples into a two-level index whose levels are named group and type.
rowMultiIndex = pd.MultiIndex.from_tuples(rowsTuple, names=['group', 'type'])

rowMultiIndex

MultiIndex([('Vector', 'type-A'),
            ('Vector', 'type-B'),
            ('Square', 'type-C'),
            ('Square', 'type-D')],
           names=['group', 'type'])

In [24]:
# Rebuild the frame with a MultiIndex on both axes.
dfMulti = pd.DataFrame(data=mulData, index=rowMultiIndex, columns=colMultiIndex)

print(dfMulti)

Category        A       B       C    
value           $   #   $   #   $   #
group  type                          
Vector type-A   0   1   2   3   4   5
       type-B   6   7   8   9  10  11
Square type-C  12  13  14  15  16  17
       type-D  18  19  20  21  22  23


In [25]:
# Export the doubly multi-indexed frame as a tight dict: labels, data and level names of both axes.
dfMulti.to_dict(orient = 'tight')

{'index': [('Vector', 'type-A'),
  ('Vector', 'type-B'),
  ('Square', 'type-C'),
  ('Square', 'type-D')],
 'columns': [('A', '$'),
  ('A', '#'),
  ('B', '$'),
  ('B', '#'),
  ('C', '$'),
  ('C', '#')],
 'data': [[0, 1, 2, 3, 4, 5],
  [6, 7, 8, 9, 10, 11],
  [12, 13, 14, 15, 16, 17],
  [18, 19, 20, 21, 22, 23]],
 'index_names': ['group', 'type'],
 'column_names': ['Category', 'value']}

### » from list of Tuples

In [26]:
# Each tuple is one row; no column names are given, so the columns are labelled 0, 1, 2.
tuple_data = list(zip(
    (101, 102, 103),
    ('Alice', 'Bob', 'Charlie'),
    ('Engineering', 'Marketing', 'Engineering'),
))

dfTuple = pd.DataFrame(tuple_data)

dfTuple

Unnamed: 0,0,1,2
0,101,Alice,Engineering
1,102,Bob,Marketing
2,103,Charlie,Engineering


### » Numpy Arrays

In [27]:
# Just a reminder
# List & Array - built in to python [list is the one mostly used; opt for array if there's a need
#                                     for memory-efficient storage of homogeneous numeric data]

# Numpy & Series - packages for data science
# Numpy  - multi-dimensional                     - integer indexed
#        - homogeneous in standard mode          - optimized for numeric computations (mathematical and scientific)
# numpy does allow structured forms where heterogeneity is permitted, but the space & time efficiencies will be compromised

# Series - one-dimensional                       - user-defined index
#        - heterogeneous capable                 - built on top of numpy, but the additional functionality and flexible indexing add overhead

# Series is better for cleaning, filtering, grouping, combining etc... also handles missing data well

#### »» Standard - homogeneous

In [28]:
# A regular 2-D ndarray: each inner list is one DataFrame row; the two columns are named A and B.
arr = np.array([[3, 4], [33, 44], [133, 144]])

df = pd.DataFrame(data=arr, columns=list('AB'))

print(df)

     A    B
0    3    4
1   33   44
2  133  144


#### »» Structured - Heterogeneous

In [29]:
# Structured array: every record is a tuple whose fields each have their own name and dtype.
# NOTE(review): '1st attmept' in the first record looks like a typo for 'attempt';
# it is data, so it is left exactly as it was.
classMarks = np.array(
    [(1, 'A', '1st attmept'), (2, 'A', '1st attempt'), (3, 'B', '2nd attempt')],
    dtype=np.dtype([('id', 'i4'), ('grade', 'U3'), ('attempt', 'U20')]),
)

classMarks

array([(1, 'A', '1st attmept'), (2, 'A', '1st attempt'),
       (3, 'B', '2nd attempt')],
      dtype=[('id', '<i4'), ('grade', '<U3'), ('attempt', '<U20')])

### » from pandas.Series

In [30]:
# A Series is a single labelled column: values, an explicit index and a name.
prodSales = pd.Series(
    [100, 200, 300, 400, 500],
    index=['me', 'myself', 'i', 'her', 'him'],   # a RangeIndex would be used if this were left out
    name='pronouns',
)

prodSales

me        100
myself    200
i         300
her       400
him       500
Name: pronouns, dtype: int64

In [31]:
# Bring the pd.Series into a DataFrame: its name becomes the single column label.
dfProd = pd.DataFrame(data=prodSales)

dfProd

Unnamed: 0,pronouns
me,100
myself,200
i,300
her,400
him,500


In [32]:
# A second Series on the same index labels, to be added as another column.
newProdSales = pd.Series(
    [111, 222, 333, 444, 555],
    index=['me', 'myself', 'i', 'her', 'him'],   # a RangeIndex would be used if this were left out
    name='xxx',
)


In [33]:
# Add the second Series as a new column next to the first one.
# The frame is rebuilt from the two source Series rather than from dfProd
# itself, so re-running this cell always yields the same two columns instead
# of appending one more 'xxx' column on every run.
dfProd = pd.concat(
    [
        prodSales.to_frame(),   # one-column DataFrame named after the Series
        newProdSales,
    ],
    axis=1,                     # place the inputs side by side, aligned on their index
)

dfProd

Unnamed: 0,pronouns,xxx
me,100,111
myself,200,222
i,300,333
her,400,444
him,500,555


## » Reading from files

### » read_csv

In [34]:
from io import StringIO

csvData = "col1, col2\n1,A\n2,B\n3,C"       # standard comma-separated text

# The header has a space after the comma. Without skipinitialspace the second
# column would be named ' col2' (with a leading space), so csvDF['col2'] would
# raise a KeyError; skipinitialspace=True drops spaces that follow a delimiter.
csvDF = pd.read_csv(StringIO(csvData), skipinitialspace=True)

print(csvDF)

   col1  col2
0     1     A
1     2     B
2     3     C


In [35]:
csvData = "col1; col2\n1;A\n2;B\n3;C"

csvDF = pd.read_csv(
    StringIO(csvData),
    sep = ';',                  # because our data uses ; - the same works for \t or |
    skipinitialspace = True     # header has a space after ';' - without this the column would be named ' col2'
)

print(csvDF)

   col1  col2
0     1     A
1     2     B
2     3     C


In [36]:
# Header-less data: header=None keeps the first line as data, and `names`
# supplies the column labels instead.
noHeaderCSV = "\n".join(["1,A", "2,B", "3,C"])

csvDF = pd.read_csv(StringIO(noHeaderCSV), header=None, names=['ID', 'Category'])

print(csvDF)

   ID Category
0   1        A
1   2        B
2   3        C


In [37]:
# index_col promotes an existing column (here 'id') to the row index.
IDdata = "\n".join([
    "id,name,grade",
    "1,saran,A",
    "2,aishu,X",
    "3,durr,X",
])

csvDF = pd.read_csv(StringIO(IDdata), index_col='id')

print(csvDF, "\n")
print(csvDF.dtypes)

     name grade
id             
1   saran     A
2   aishu     X
3    durr     X 

name     object
grade    object
dtype: object


In [38]:
# Four columns are available in the text, but only three are loaded below.
cityList = """
id,city,state,popln
1,seattle,WA,90009
2,portland,OR,80008
3,Anchorage,AK,30002
"""

csvDF = pd.read_csv(
    StringIO(cityList),
    usecols=['id', 'city', 'popln'],                       # keep city and popln, plus id for the index
    dtype={'id': 'Int8', 'city': str, 'popln': 'int64'},   # set the dtypes while reading
    index_col='id',
)

print(csvDF, "\n")
print(csvDF.dtypes)

         city  popln
id                  
1     seattle  90009
2    portland  80008
3   Anchorage  30002 

city     object
popln     int64
dtype: object


Final Notes:

Best practices:
1. While loading large datasets:  
»»» **specify dtype** explicitly, because leaving pandas to infer the types can become an issue in large datasets  
»»» **IDs as strings** - it is good to read IDs as strings rather than ints, because a value like 09100 would otherwise be changed to 9100  
»»» **repeated strings in a column as category** - i.e. something like dtype = {'product category': 'category'}  
»»» **load only the necessary columns** using usecols = ['a', 'b']  
»»» **load a sample set of rows** using nrows = 1000  
»»» **large files in chunks** - chunksize = 10000 --> returns a 'TextFileReader' iterator that yields each chunk as a separate, temporary DataFrame  
»»» **parse dates when you load** - e.g. parse_dates = {'event_date': ['year','month','day']} -- better than pd.to_datetime() later on (note: combining several columns this way is deprecated in recent pandas releases; parse_dates = ['event_date'] on a single column is unaffected)  


### » read_excel

In [39]:
import openpyxl

In [43]:
# Load a single sheet of the workbook by name.
try:
    xDF = pd.read_excel('jobs.xlsx', sheet_name = 'jobs') # alternatively sheet_name = 0 (meaning the first sheet in the workbook)
except Exception as e:
    print(e)
    raise   # stop here: without xDF the preview below would fail with a misleading NameError, or show a stale frame

xDF.head()

Unnamed: 0,Company,Job Title,Category,Sub Category,Job Link,Job Description,Saved At
0,Holman,Business Intelligence Developer II,BIE,Risky,https://www.linkedin.com/jobs/view/4259283922/...,"About the job\n\nHolman is a family-owned, glo...","6/30/2025, 2:26:31 AM"
1,The College Board,"Data Analyst, AWS Analytics",Analyst,Risky,https://www.linkedin.com/jobs/view/4259283160/...,About the job\n\nCollege Board - Technology Di...,"6/30/2025, 2:29:17 AM"
2,"AmTrust Financial Services, Inc.",Analytics Analyst II,BIE,Medium,https://www.linkedin.com/jobs/view/4257913602/...,About the job\n\nOverview\n\nAmTrust Financial...,"6/30/2025, 2:40:56 AM"
3,University of Montana Foundation,Programmer Analyst,Analyst,Low,https://www.linkedin.com/jobs/view/4255252626/...,About the job\n\nAt the University of Montana ...,"6/30/2025, 3:08:14 AM"
4,Gen,Digital Product Analyst,Analyst,Medium,https://www.linkedin.com/jobs/view/4205437869/...,About the job\n\nAbout The Role\n\nWe are hiri...,"6/30/2025, 3:27:17 AM"


In [45]:
# Load several sheets at once: a list passed to sheet_name returns a dict of
# DataFrames keyed by sheet name.
try:
    xDF_new = pd.read_excel(
        'jobs.xlsx',
        sheet_name = [                                           # multiple sheets can be brought in like a dict of DataFrames
            'jobs',
            'Sheet1'
            ],
        # engine = 'openpyxl',                                   # openpyxl is default. 'xlrd' for xls file. 'pyxlsb' for .xlsb file
        usecols = ['Company', 'Category', 'Sub Category'],       # usecols - same as read_csv()
        # header = 0,                                            # sometimes the header is on row 2 because of odd excel formatting
        # skiprows = 0,                                          # skip top rows where some formatting or metadata exists
        # skipfooter = 0                                         # excel files may end with signatures, so skip them
        ) # only the two listed sheets are read (sheet_name = None would read all sheets)
except Exception as e:
    print(e)
    raise   # stop here: without xDF_new the display below would fail with a misleading NameError, or show a stale dict

xDF_new

{'jobs':                               Company       Category Sub Category
 0                              Holman            BIE        Risky
 1                   The College Board        Analyst        Risky
 2    AmTrust Financial Services, Inc.            BIE       Medium
 3    University of Montana Foundation        Analyst          Low
 4                                 Gen        Analyst       Medium
 ..                                ...            ...          ...
 229                        Quartet AI  Data Engineer     moderate
 230                               ATC        Analyst     moderate
 231                               ICF        Analyst     moderate
 232                             Huron  Data Engineer     moderate
 233                             Huron        Analyst     moderate
 
 [234 rows x 3 columns],
 'Sheet1':                                               Company Category Sub Category
 0                                              Holman      BIE        Ris

Use the same best practices as for read_csv().

In [46]:
xDF_new['jobs']                     # index the dict by sheet name to get that sheet's DataFrame

Unnamed: 0,Company,Category,Sub Category
0,Holman,BIE,Risky
1,The College Board,Analyst,Risky
2,"AmTrust Financial Services, Inc.",BIE,Medium
3,University of Montana Foundation,Analyst,Low
4,Gen,Analyst,Medium
...,...,...,...
229,Quartet AI,Data Engineer,moderate
230,ATC,Analyst,moderate
231,ICF,Analyst,moderate
232,Huron,Data Engineer,moderate


In [47]:
# Print every loaded sheet. The loop variable is deliberately not named `df`,
# so the DataFrame that other cells bind to `df` is not overwritten, and only
# the values are iterated because the sheet names are not used here.
for sheet_df in xDF_new.values():
    print(sheet_df)

                              Company       Category Sub Category
0                              Holman            BIE        Risky
1                   The College Board        Analyst        Risky
2    AmTrust Financial Services, Inc.            BIE       Medium
3    University of Montana Foundation        Analyst          Low
4                                 Gen        Analyst       Medium
..                                ...            ...          ...
229                        Quartet AI  Data Engineer     moderate
230                               ATC        Analyst     moderate
231                               ICF        Analyst     moderate
232                             Huron  Data Engineer     moderate
233                             Huron        Analyst     moderate

[234 rows x 3 columns]
                                              Company Category Sub Category
0                                              Holman      BIE        Risky
1                               

In [48]:
# df.from_records?                        # nice method, but not needed -- pd.DataFrame() already covers what it does

### » read_parquet

In [50]:
# Read only the needed columns and rows straight from the parquet file.
parqDF = pd.read_parquet(
    'titanic.parquet',
    engine='pyarrow',       # default is 'auto': pyarrow is tried first, fastparquet is the fallback
    columns=['PassengerId', 'Survived', 'Sex', 'Age', 'Pclass'],   # filter by columns
    filters=[               # filter by row values; operators: ==, =, !=, <, <=, >, >=, in, not in
        ('Survived', '==', 1),
        ('Age', '<', 30),
    ],                      # author's note: engine='pyarrow' had to be given explicitly for the filter to work
)

print(parqDF)

     PassengerId  Survived     Sex   Age  Pclass
0              3         1  female  26.0       3
1              9         1  female  27.0       3
2             10         1  female  14.0       2
3             11         1  female   4.0       3
4             23         1  female  15.0       3
..           ...       ...     ...   ...     ...
151          875         1  female  28.0       2
152          876         1  female  15.0       3
153          881         1  female  25.0       2
154          888         1  female  19.0       1
155          890         1    male  26.0       1

[156 rows x 5 columns]


### » read_sql

In [67]:
from sqlalchemy import create_engine


# Run a SQL query against the database and load the result into a DataFrame.
try:
    engine = create_engine('postgresql://toofanmacpro@localhost:5432/testdb')
    print(engine, "created\n")

    # The with-block closes the connection on exit, so no explicit close() is needed.
    with engine.connect() as connection:
        df = pd.read_sql('SELECT * FROM "Employee";', connection)
        print("read success")
        print(df)
except Exception as e:
    print(f"error occurred {e}")

Engine(postgresql://toofanmacpro@localhost:5432/testdb) created

read success
   index  id  salary
0      0   1     100
1      1   2     200
2      2   3     300


In [68]:
# Load a whole table by name instead of writing a query.
try:
    engine = create_engine('postgresql://toofanmacpro@localhost:5432/testdb')
    print(engine, "created\n")

    # The with-block closes the connection on exit, so no explicit close() is needed.
    with engine.connect() as connection:
        df_table = pd.read_sql_table('Employee', connection)
        print(df_table)
except Exception as e:
    print(f"error occurred {e}")

Engine(postgresql://toofanmacpro@localhost:5432/testdb) created

   index  id  salary
0      0   1     100
1      1   2     200
2      2   3     300
