## Importing packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np
import glob
import re

## Importing csv files

In [2]:
# Load the TMDB movies data and the avocado data from their CSV files
tmdbDataSet = pd.read_csv('tmdb_5000_movies.csv')
avocadoDataSet = pd.read_csv('avocado.csv')
# The avocado file has an 'Unnamed: 0' column that is not needed; drop it in place
avocadoDataSet.drop('Unnamed: 0', axis=1, inplace=True)

## Extract basic information about dataset

In [None]:
# Show the first five rows of the movies dataset
first_rows = tmdbDataSet.head(5)
print(first_rows)

In [None]:
# Show only the final row of the movies dataset
last_row = tmdbDataSet.tail(n=1)
print(last_row)

In [8]:
# Summarise the dataset: column types and the count of non-null values per column.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
tmdbDataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
budget                  4803 non-null int64
genres                  4803 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4803 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null float64
vote_count              4803 non-null int64

In [None]:
# The dtypes attribute lists the data type of every column
column_types = tmdbDataSet.dtypes
print(column_types)

In [None]:
# List the column labels of the dataset
column_labels = tmdbDataSet.columns
print(column_labels)

In [None]:
# Dataset dimensions as a (rows, columns) tuple
dataset_shape = tmdbDataSet.shape
print(dataset_shape)

In [None]:
# Summary statistics: count, mean, std, min, 25%, 50%, 75% and max
summary_statistics = tmdbDataSet.describe()
print(summary_statistics)

In [None]:
# Compute the 0.55 quantile (55th percentile) of vote_count
vote_counts = tmdbDataSet['vote_count']
print(vote_counts.quantile([0.55]))

In [None]:
# Manual look at the same region of the data: sort by vote_count, keep the
# first 2641 rows and show the vote_count of the last of them.
# NOTE(review): 2641 is presumably 55% of the 4803 rows; quantile() interpolates,
# so the two results are close but not guaranteed to be identical.
temp = tmdbDataSet.sort_values(by='vote_count').iloc[:2641]
print(temp['vote_count'].tail(1))

In [None]:
# Median of every numeric column.
# numeric_only=True is passed explicitly: the dataset also holds text columns,
# and pandas 2.x raises on them instead of silently skipping them.
print(tmdbDataSet.median(numeric_only=True))

## Analyzing null values
As observed above, only 1712 of the 4803 rows have a non-null value in the homepage column. Let's investigate further and check why this column has so many null values.

In [None]:
# Rows whose homepage value is missing
homepage_missing = tmdbDataSet['homepage'].isnull()
tmdbDataSet[homepage_missing]

In [None]:
# Count how often each distinct production_countries value occurs
country_counts = tmdbDataSet['production_countries'].value_counts()
print(country_counts)

In [None]:
# Frequency of each spoken_languages value; dropna=False also counts missing values
language_counts = tmdbDataSet['spoken_languages'].value_counts(dropna=False)
print(language_counts)

In [None]:
# Frequency of each release_date value; dropna=False also counts missing values
release_date_counts = tmdbDataSet['release_date'].value_counts(dropna=False)
print(release_date_counts)

## Converting data type of the column
The status column has only 3 distinct values, so it makes sense to convert its data type to category.

In [None]:
# Store status as a categorical column and confirm the resulting dtype
status_as_category = tmdbDataSet['status'].astype('category')
tmdbDataSet['status'] = status_as_category
print(tmdbDataSet['status'].dtype)

In [None]:
# pd.to_numeric converts a column to a numeric dtype.
# The status values are text, so the strict conversion below is expected to
# fail. The error is caught and reported so that the demonstration does not
# abort a "run all cells" execution; the column is left unchanged on failure.
try:
    tmdbDataSet['status'] = pd.to_numeric(tmdbDataSet['status'])
except (TypeError, ValueError) as conversion_error:
    print('Could not convert status to numeric:', conversion_error)

In [None]:
# Forced conversion: errors='coerce' puts NaN wherever a value cannot be
# parsed as a number instead of raising. The result is displayed, not stored
# back into the dataset.
status_numeric = pd.to_numeric(tmdbDataSet['status'], errors='coerce')
status_numeric

# Melting and pivoting of dataset

In [None]:
# Melting of dataset
# None of the pd.melt calls below passes value_vars, so pandas unpivots every
# column that is not listed in id_vars.
#
# Step 1: every listed column is kept as an identifier; the remaining
# column(s) (presumably only '4770' - TODO confirm) are melted. The source
# column name goes to 'avocados_variety' and the value to a column named '4770'.
avocadoDataSet_melt_4770 = pd.melt(avocadoDataSet, id_vars=['Date', 'AveragePrice', 'Total Volume', '4046', '4225',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'], var_name='avocados_variety', value_name='4770')

# Step 2: same idea with '4225' removed from id_vars.
# NOTE(review): 'avocados_variety' and '4770' created in step 1 are not in
# id_vars either, so they are melted together with '4225'; the new value
# column then mixes strings and numbers - confirm this is intended.
avocadoDataSet_melt_4225 = pd.melt(avocadoDataSet_melt_4770, id_vars=['Date', 'AveragePrice', 'Total Volume', '4046',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'], var_name='avocados_variety', value_name='4225')

# Step 3: same idea with '4046' removed from id_vars; the same caveat applies
# to the 'avocados_variety' and '4225' columns created in step 2.
# NOTE(review): a single melt with value_vars=['4046', '4225', '4770'] may be
# what was meant - verify before relying on the '4046' column downstream.
avocadoDataSet_melt_4046 = pd.melt(avocadoDataSet_melt_4225, id_vars=['Date', 'AveragePrice', 'Total Volume',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'], var_name='avocados_variety', value_name='4046')

print(avocadoDataSet_melt_4046.head())

In [None]:
# Pivoting of dataset
# Entries of the melted '4046' column that cannot be parsed as numbers are
# coerced to NaN so that a mean can be computed.
avocadoDataSet_melt_4046['4046'] = pd.to_numeric(avocadoDataSet_melt_4046['4046'], errors='coerce')

# Mean of '4046' per Date (rows) and avocado type (columns).
# The string 'mean' selects the same pandas aggregation as np.mean did and
# avoids the FutureWarning newer pandas emits for NumPy callables.
avocadoDataSet_pivot = avocadoDataSet_melt_4046.pivot_table(
    index=['Date'], columns='type', values='4046', aggfunc='mean')

print(avocadoDataSet_pivot.head())

In [None]:
# Pivoting of dataset
# If we also want to see the variety along with the type
# Entries of the melted '4046' column that cannot be parsed as numbers are
# coerced to NaN so that a mean can be computed.
avocadoDataSet_melt_4046['4046'] = pd.to_numeric(avocadoDataSet_melt_4046['4046'], errors='coerce')

# Mean of '4046' per (Date, avocados_variety) row and avocado type column.
# The string 'mean' selects the same pandas aggregation as np.mean did and
# avoids the FutureWarning newer pandas emits for NumPy callables.
avocadoDataSet_pivot = avocadoDataSet_melt_4046.pivot_table(
    index=['Date', 'avocados_variety'], columns='type', values='4046', aggfunc='mean')

print(avocadoDataSet_pivot.head())

In [None]:
# Pivoting of dataset without naming a single value column
# With values omitted pandas tries to average every remaining column. Older
# pandas silently dropped columns whose mean cannot be computed, while
# pandas 2.x raises, so the numeric columns are selected explicitly.
numeric_columns = avocadoDataSet_melt_4046.select_dtypes(include='number').columns.tolist()
avocadoDataSet_pivot = avocadoDataSet_melt_4046.pivot_table(
    index=['Date'], columns='type', values=numeric_columns)

print(avocadoDataSet_pivot.head())

In [None]:
# Pivoting of dataset with margin
# margins=True appends an 'All' row and column holding the overall aggregates.
# The numeric columns are selected explicitly: older pandas silently dropped
# columns whose mean cannot be computed, while pandas 2.x raises on them.
numeric_columns = avocadoDataSet_melt_4046.select_dtypes(include='number').columns.tolist()
avocadoDataSet_pivot_margin = avocadoDataSet_melt_4046.pivot_table(
    index=['Date'], columns='type', values=numeric_columns, margins=True)

print(avocadoDataSet_pivot_margin.tail())

# Splitting columns

We can even split a single column into multiple columns.

In [None]:
# Lookup from month number to the label stored in the new column
month = {
    1: 'JAN', 2: 'FEB', 3: 'MAR', 4: 'APR', 5: 'MAY', 6: 'JUNE',
    7: 'JULY', 8: 'AUG', 9: 'SEP', 10: 'OCT', 11: 'NOV', 12: 'DEC',
}
# Split each Date string on '-' and take the second piece as the month
month_text = avocadoDataSet['Date'].str.split('-').str.get(1)
# Pieces that are not numeric become NaN, which map() leaves as NaN
month_number = pd.to_numeric(month_text, errors='coerce')
avocadoDataSet['Month'] = month_number.map(month)
print(avocadoDataSet.head())

## Reading multiple files in Python

In [None]:
# Find every CSV file in the current working directory
pattern = '*.csv'
# glob returns its matches in arbitrary order; sorting makes the index used
# below refer to the same file on every run.
csv_files = sorted(glob.glob(pattern))
# Load the second match (in alphabetical order) and show it
dataset = pd.read_csv(csv_files[1])
print(dataset)

In [None]:
# Summary of both datasets before removing duplicates.
# info() prints its report itself and returns None, so it is not wrapped in print().
tmdbDataSet.info()
avocadoDataSet.info()

# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be captured for the comparison below to mean anything.
avocado_deduplicated = avocadoDataSet.drop_duplicates()
tmdb_deduplicated = tmdbDataSet.drop_duplicates()

tmdb_deduplicated.info()
avocado_deduplicated.info()

# If neither dataset contains duplicate rows, the row counts reported
# before and after are identical.

## Regular Expressions and Apply function in Python

In [None]:
# Pattern: three digits, three digits and four digits separated by '-'.
# A raw string is used so that '\d' reaches the regex engine untouched instead
# of being treated as an (invalid) string escape.
prog = re.compile(r'\d{3}-\d{3}-\d{4}')

# match() anchors at the start of the string, so this one succeeds ...
result = prog.match('123-456-7890')
print(bool(result))

# ... and this one fails, because the string starts with four digits
result = prog.match('1123-456-7890')
print(bool(result))

In [None]:
# Raw string so that '\d' is passed to the regex engine unchanged
matches = re.findall(r'\d+', 'Out of 3 houses, 2 houses have 4 people statying.')
print(matches)

# Observation
# findall returns every run of digits found in the string, as a list of strings

In [None]:
def encode_avocado_type(value):
    """Return 1 for 'conventional', 0 for 'organic' and None for anything else."""
    is_conventional = value == 'conventional'
    if is_conventional:
        return 1
    return 0 if value == 'organic' else None

# Add the numeric encoding of the avocado type as a new column
type_codes = avocadoDataSet['type'].apply(encode_avocado_type)
avocadoDataSet['type_coded'] = type_codes

print(avocadoDataSet.head())

In [None]:
# Date with every '-' replaced by '/'
avocadoDataSet['Date_replace'] = avocadoDataSet['Date'].apply(lambda x: x.replace('-', '/'))

# List of every single digit character found in the date string.
# The pattern is a raw string so that '\d' is not treated as a string escape.
avocadoDataSet['Date_find'] = avocadoDataSet['Date'].apply(lambda x: re.findall(r'\d', x))

print(avocadoDataSet.head())

## Handling missing data in the column

In tmdbDataSet, two rows have no data in the runtime column. We can either delete these two rows or fill in the missing values.

In [27]:
# Shape of the full dataset, for comparison with the variants below
print(tmdbDataSet.shape)

# how='any': a row is dropped as soon as one of its columns is null
rows_without_any_null = tmdbDataSet.dropna(how='any')
print(rows_without_any_null.shape)

# how='all': a row is dropped only when every column is null
rows_not_entirely_null = tmdbDataSet.dropna(how='all')
print(rows_not_entirely_null.shape)

# dropna(thresh=19) would instead keep the rows that have at least
# 19 non-null values (not executed here)

# subset=[...]: only nulls in the named column cause a row to be dropped
rows_with_runtime = tmdbDataSet.dropna(subset=['runtime'])
print(rows_with_runtime.shape)

(4803, 20)
(1493, 20)
(4803, 20)
(4801, 20)


In [None]:
# Rows that have no value in the runtime column
runtime_missing = tmdbDataSet['runtime'].isnull()
print(tmdbDataSet[runtime_missing])

In [None]:
# Replace missing runtime values with the mean runtime of the dataset
runtime_mean = tmdbDataSet.runtime.mean()
tmdbDataSet['runtime'] = tmdbDataSet['runtime'].fillna(runtime_mean)

# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line.
tmdbDataSet.info()

In [None]:
# Forward fill and backward fill
# ffill() copies the last seen non-null value downwards, bfill() copies the
# next non-null value upwards. The dedicated methods are used because
# fillna(method=...) is deprecated in recent pandas versions.
tmdbDataSet['homepage_ffill'] = tmdbDataSet['homepage'].ffill()
tmdbDataSet['homepage_bfill'] = tmdbDataSet['homepage'].bfill()
# Rows still null after a forward fill: those before the first non-null homepage
print(tmdbDataSet[tmdbDataSet['homepage_ffill'].isnull()])