# Victor's Pandas Cheat Sheet for Data Science

### Imports

In [None]:
# Imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
print("💥Imports firing💥")


# Pandas

### Load data

In [None]:
# Read data from a file into a DataFrame (replace each placeholder file name with your real path)

df1 = pd.read_json('yourFile.json') # load a JSON file
df2 = pd.read_csv('yourFile.csv') # load a CSV file
df3 = pd.read_excel('yourFile.xlsx') # load an Excel file

In [None]:
# Build a DataFrame by hand: every keyword is a column name, every list holds that column's values

df = pd.DataFrame(
    dict(
        col1=['Item1', 'Item2', 'Item3', 'Item4'],  # text column
        col2=[1, 2, 3, 4],  # integer column
        col3=[1.1, 2.2, 3.3, 4.4],  # float column
    )
)

### Examine data

In [None]:
# Examine the data: quick ways to inspect a DataFrame's contents and structure

df.head()  # first five rows (pass a number for a different count)
df.tail()  # last five rows (pass a number for a different count)
df.shape  # (rows, columns) tuple; an attribute, so no parentheses
df.columns  # column labels
df.describe()  # statistical summary of the columns
df.info()  # prints an overview: column dtypes, non-null counts and memory usage

### Selecting and filtering data

In [None]:
# 'column_name', 'column' and value below are placeholders: substitute your own names and threshold
df['column_name']  # select a single column (returns a Series)
df[['col1', 'col2']]  # select multiple columns (returns a DataFrame)
df[df['column'] > value]  # keep only the rows where the column is greater than value (boolean mask)

df.iloc[0]  # row at integer position 0 (the first row), regardless of its index label
df.loc[0]  # row whose index label is 0

### Handling missing data

In [None]:
df.isnull().sum()                   # number of missing values in each column
df.isnull()                         # boolean DataFrame: True wherever a value is missing

df.dropna()                         # returns a copy without the rows that contain missing values (df is unchanged)

df.fillna(value)                    # returns a copy with missing values replaced by `value` (placeholder)
# Fill missing values with each column's mean, modifying df in place.
# numeric_only=True restricts the mean to numeric columns; without it a text
# column makes df.mean() raise TypeError on pandas 2.x. Non-numeric columns
# are simply left unfilled.
df.fillna(df.mean(numeric_only=True), inplace=True)

### Manipulating data

In [None]:
# Order the rows by the values in 'col1' (inplace=True mutates df instead of returning a copy)
df.sort_values(by='col1', inplace=True)
# Give 'col1' a more descriptive name, again modifying df directly
df.rename({'col1': 'wordsList'}, axis='columns', inplace=True)
# Derive a new column as the element-wise sum of two existing ones
df['new_col'] = df['col2'] + df['col3']
# Preview the result
df.head()

# Machine learning

### Imports and preparation

In [None]:
# Import libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Features: every column except the target (replace 'target_column' with your target column name)
X = df.drop(columns='target_column')
# Target: the single column the model should learn to predict
y = df['target_column']

#### Scale if needed

In [None]:
# Standardize every feature column with scikit-learn's StandardScaler.
# NOTE(review): fitting the scaler on all rows before the train/test split lets
# test-set statistics leak into training; for a real evaluation, fit on the
# training rows only and reuse the fitted scaler on the test rows.
scaler = StandardScaler()
# Wrap the scaled values back into a DataFrame, restoring the original column
# names and row index so X stays label-aligned with y.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

### Split training and testing

In [None]:
# Hold out 20% of the rows for testing (80:20 split); the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# General Python