## Importing packages

In [137]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np

## Importing csv files

In [169]:
tmdbDataSet = pd.read_csv('tmdb_5000_movies.csv')
avocadoDataSet = pd.read_csv('avocado.csv')
del avocadoDataSet['Unnamed: 0'] # removing unwanted column from dataset

## Extract basic information about dataset

In [35]:
# Returning top 5 rows
print(tmdbDataSet.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [None]:
# Returning last row
print(tmdbDataSet.tail(1))

In [None]:
# Getting info about dataset like clolumn types, count of not null values
print(tmdbDataSet.info())

In [None]:
# Another way to check datatypes of the columns
print(tmdbDataSet.dtypes)

In [109]:
# Get all the columns in dataset
print(tmdbDataSet.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [None]:
# Get the shape of dataset
print(tmdbDataSet.shape)

In [None]:
# Get information like count, mean, std, min, 25%, 50%, 75% and max of dataset
print(tmdbDataSet.describe())

In [None]:
# To calulate 55% of the dataset
print(tmdbDataSet.vote_count.quantile([0.55]))

In [None]:
# output is same as that of above cell
temp = tmdbDataSet.sort_values(['vote_count']).head(2641)
print(temp.vote_count.tail(1))

In [None]:
# To calculate median of the dataset
print(tmdbDataSet.median())

## Analyzing null values
As we observed that out of 4803 rows only 1712 are the not null values for the homepage column. Let's investigate it further and check why these column have null values

In [None]:
tmdbDataSet[tmdbDataSet['homepage'].isnull()]

In [None]:
# Check the unique values of the column 
print(tmdbDataSet.production_countries.value_counts())

In [88]:
print(tmdbDataSet.spoken_languages.value_counts(dropna=False))

NaN    4803
Name: spoken_languages, dtype: int64


In [None]:
print(tmdbDataSet.release_date.value_counts(dropna=False))

## Converting data type of the column
As we have seen above that status column have only 3 distinct values. So it would be feasible to convert the column data type to category

In [None]:
# Converting datatype of column to category
tmdbDataSet['status'] = tmdbDataSet['status'].astype('category')
print(tmdbDataSet.status.dtypes)

In [None]:
# Command used to convert data type to numeric
# Note that below command will give an error while executing as string cannot be converted to numeric
tmdbDataSet['status'] = pd.to_numeric(tmdbDataSet['status'])

In [None]:
# To forcefully convert column to numeric. It will insert NaN for the values it cannot convert to integer
pd.to_numeric(tmdbDataSet['status'], errors='coerce')

# Melting and pivoting of dataset

In [149]:
# Melting of dataset
avocadoDataSet_melt_4770 = pd.melt(avocadoDataSet, id_vars=['Date', 'AveragePrice', 'Total Volume', '4046', '4225',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'], var_name='avocados_variety', value_name='4770')

avocadoDataSet_melt_4225 = pd.melt(avocadoDataSet_melt_4770, id_vars=['Date', 'AveragePrice', 'Total Volume', '4046',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'], var_name='avocados_variety', value_name='4225')

avocadoDataSet_melt_4046 = pd.melt(avocadoDataSet_melt_4225, id_vars=['Date', 'AveragePrice', 'Total Volume',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
       'region'], var_name='avocados_variety', value_name='4046')

print(avocadoDataSet_melt_4046.head())

         Date  AveragePrice  Total Volume  Total Bags  Small Bags  Large Bags  \
0  2015-12-27          1.33      64236.62     8696.87     8603.62       93.25   
1  2015-12-20          1.35      54876.98     9505.56     9408.07       97.49   
2  2015-12-13          0.93     118220.22     8145.35     8042.21      103.14   
3  2015-12-06          1.08      78992.15     5811.16     5677.40      133.76   
4  2015-11-29          1.28      51039.60     6183.95     5986.26      197.69   

   XLarge Bags          type  year  region avocados_variety     4046  
0          0.0  conventional  2015  Albany             4046  1036.74  
1          0.0  conventional  2015  Albany             4046   674.28  
2          0.0  conventional  2015  Albany             4046    794.7  
3          0.0  conventional  2015  Albany             4046     1132  
4          0.0  conventional  2015  Albany             4046   941.48  


# Splitting columns

We can even split a single column into multiple column.

In [174]:
month = {1: 'JAN', 2: 'FEB', 3: 'MAR', 
         4: 'APR',5: 'MAY', 6: 'JUNE',7: 'JULY', 8: 'AUG', 9: 'SEP', 10: 'OCT', 11: 'NOV', 12: 'DEC' }
avocadoDataSet['Month'] = avocadoDataSet.Date.str.split('-').str.get(1)
avocadoDataSet['Month'] = pd.to_numeric(avocadoDataSet['Month'], errors='coerce')
avocadoDataSet['Month'] = avocadoDataSet['Month'].map(month)
print(avocadoDataSet.head())

         Date  AveragePrice  Total Volume     4046       4225    4770  \
0  2015-12-27          1.33      64236.62  1036.74   54454.85   48.16   
1  2015-12-20          1.35      54876.98   674.28   44638.81   58.33   
2  2015-12-13          0.93     118220.22   794.70  109149.67  130.50   
3  2015-12-06          1.08      78992.15  1132.00   71976.41   72.58   
4  2015-11-29          1.28      51039.60   941.48   43838.39   75.78   

   Total Bags  Small Bags  Large Bags  XLarge Bags          type  year  \
0     8696.87     8603.62       93.25          0.0  conventional  2015   
1     9505.56     9408.07       97.49          0.0  conventional  2015   
2     8145.35     8042.21      103.14          0.0  conventional  2015   
3     5811.16     5677.40      133.76          0.0  conventional  2015   
4     6183.95     5986.26      197.69          0.0  conventional  2015   

   region Month  
0  Albany   DEC  
1  Albany   DEC  
2  Albany   DEC  
3  Albany   DEC  
4  Albany   NOV  
