## Importing Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Reading data

In [2]:
# Load the TMDB movies CSV into the frame used by the cells below.
# NOTE(review): another cell in this notebook reads from './Cleaning Data/' (capital D) —
# confirm the directory's real casing; the two only agree on case-insensitive filesystems.
movies_csv_path = './Cleaning data/tmdb_5000_movies.csv'
tmdbDataSet = pd.read_csv(movies_csv_path)

In [None]:
# Values in the dataset
# Fetch the frame's underlying array once, then show both its contents and its type.
dataset_values = tmdbDataSet.values
print(dataset_values)
print(type(dataset_values))

In [None]:
# Creating dataset out of list
# `index` holds the column labels and `details` holds one list of values per label.
index = ['Name', 'Salary', 'Age']
details = [['Ashok', 'Mike', 'Arun'], [1200, 1400, 2500], [23, 28, 30]]

# Pair each label with its values, then turn the pairs into a label -> values mapping.
zippedList = [(label, values) for label, values in zip(index, details)]
dictObject = {label: values for label, values in zippedList}

# One DataFrame column per mapping entry.
df = pd.DataFrame(dictObject)
print(df)

In [None]:
# Updating column labels
# Relabel by column *name* rather than by position. Assigning a positional list
# only works if the frame's column order matches the list, and a frame built from
# a dict keeps the dict's insertion order (not alphabetical order), so a positional
# list written in alphabetical order would put the wrong label on each column.
label_map = {'Age': 'Age(In Years)', 'Salary': 'Salary (In $)'}

# Columns without an entry in the mapping (e.g. 'Name', or labels that were
# already renamed by a previous run of this cell) are kept unchanged.
list_labels = [label_map.get(label, label) for label in df.columns]
df.columns = list_labels
print(df)

In [None]:
# Updating column labels at time of importing dataset
# header=0 marks the first row of the file as its header so that `names` replaces it.
# NOTE(review): this cell reads 'tmdb_5000_movies.csv' from the working directory while
# other cells read it from a 'Cleaning data' sub-directory — confirm the intended location.
# NOTE(review): comment='#' tells the parser to treat '#' as the start of a comment —
# confirm that no field in the file legitimately contains '#'.
renamed_columns = [
    'budget', 'genres', 'home_page', 'id', 'keywords', 'original_language',
    'original_title', 'overview', 'popularity', 'production_companies',
    'production_countries', 'release_date', 'revenue', 'runtime',
    'spoken_languages', 'movie_status', 'tagline', 'title', 'vote_average',
    'vote_count',
]
tmdbDataSet_rename = pd.read_csv('tmdb_5000_movies.csv', header=0, names=renamed_columns, comment='#')
print(tmdbDataSet_rename.head())

In [None]:
# Exporting dataset to csv or excel file
# index=False leaves the row index out of both output files.
csv_target = 'tmdb_movies.csv'
excel_target = 'tmdb_movies.xlsx'

tmdbDataSet.to_csv(csv_target, index=False)
tmdbDataSet.to_excel(excel_target, index=False)

In [None]:
# Plotting datasets
# Line chart of the frame on a single set of axes ('line' is the default kind).
tmdbDataSet.plot(kind='line')
plt.show()

In [None]:
# Plot columns on different charts
# subplots=True gives every plotted column its own axes instead of sharing one.
tmdbDataSet.plot(kind='line', subplots=True)
plt.show()

In [None]:
# plot selected columns
# Restrict the frame to the two columns of interest, then draw one chart per column.
columnList = ['vote_count','budget']
selected_columns = tmdbDataSet[columnList]
selected_columns.plot(subplots=True)
plt.show()

In [None]:
# Plot budget with the production_companies column supplying the x-axis values.
tmdbDataSet.plot(
    y=['budget'],
    x='production_companies',
)
plt.show()

In [None]:
# Scatter and box plot
cols = ['vote_count','budget']

# Box plot: one chart per selected column.
vote_budget = tmdbDataSet[cols]
vote_budget.plot.box(subplots=True)
plt.show()

# Scatter plot of vote_count against budget; each marker's size comes from popularity.
tmdbDataSet.plot.scatter(x='budget', y='vote_count', s=tmdbDataSet['popularity'])
plt.show()

In [None]:
# Histogram chart
# Distribution of the vote_average column.
tmdbDataSet.plot.hist(y='vote_average')
plt.show()

In [None]:
# PDF and CDF
# Normalised histogram of vote_average (an empirical PDF).
# `density=True` replaces the `normed=True` keyword, which matplotlib's hist()
# no longer accepts; the extra keyword is forwarded by pandas to matplotlib.
tmdbDataSet.plot(kind='hist', y='vote_average', density=True)
plt.show()

In [None]:
# Way of plotting two charts in one figure
# Two stacked axes: the plain histogram on top, the normalised cumulative
# histogram (an empirical CDF) below.
figure, axes = plt.subplots(nrows=2, ncols=1)

tmdbDataSet.plot(ax=axes[0], kind='hist', y='vote_average')
# `density=True` replaces the `normed=True` keyword, which matplotlib's hist()
# no longer accepts.
tmdbDataSet.plot(ax=axes[1], kind='hist', y='vote_average', density=True,
                 cumulative=True)

plt.show()

In [None]:
# Load the movies with release_date as the row index; parse_dates=True asks pandas
# to parse that index as dates.
# NOTE(review): this cell reads 'tmdb_5000_movies.csv' from the working directory while
# other cells read it from a 'Cleaning data' sub-directory — confirm the intended location.
tmdbDataSet_date_index = pd.read_csv('tmdb_5000_movies.csv', index_col='release_date', parse_dates=True)

print(tmdbDataSet_date_index.head())

# Rows whose release date is 1 August 2010.
# NOTE(review): this raises KeyError if no row has exactly that date — confirm one exists.
print(tmdbDataSet_date_index.loc['2010-Aug-01'].head())

# Slicing between two timestamps that are not themselves index labels requires a
# monotonic DatetimeIndex (recent pandas raises KeyError otherwise). Rows with a
# missing date are dropped and the rest sorted before slicing, so the slice works
# whatever order the file is in. The frame bound above is left untouched.
dated_rows = tmdbDataSet_date_index[tmdbDataSet_date_index.index.notna()].sort_index()
print(dated_rows.loc['2010-01-01 21:00:00':'2010-05-11 22:00:00'].head())

In [None]:
# Convert timestamp strings to datetimes with an explicit format.
# The format must describe the whole string: the inputs carry seconds, so the
# pattern needs ':%S' — without it strict parsing rejects the trailing ':00'.
parsed_dates = pd.to_datetime(['2010-01-01 21:00:00', '2010-05-11 22:00:00'], format='%Y-%m-%d %H:%M:%S')
print(parsed_dates)

In [None]:
# Re-label the rows with even integers 0, 2, 4, ...
# The label count is taken from the frame itself instead of a hard-coded row
# count, so the assignment cannot fail with a length mismatch if the data changes.
tmdbDataSet.index = [2 * position for position in range(len(tmdbDataSet))]

In [None]:
# Show the row index's name, set it to 'movie_index', then show it again.
row_index = tmdbDataSet.index
print(row_index.name)
row_index.name = 'movie_index'
print(row_index.name)

In [None]:
# Build a two-level row index from release_date and status.
# The two columns are moved into the index and the name is rebound, so this cell
# cannot be run a second time on its own result.
index_levels = ['release_date', 'status']
tmdbDataSet = tmdbDataSet.set_index(index_levels)

In [None]:
# Order the rows by their index labels.
tmdbDataSet = tmdbDataSet.sort_index(axis=0)

In [None]:
# Select by one (release_date, status) pair of index labels.
release_key = ('1916-09-04', 'Released')
print(tmdbDataSet.loc[release_key])

In [None]:
# Select rows for several release dates that all share the 'Released' status;
# the trailing ':' keeps every column.
wanted_dates = ['1916-09-04', '2010-03-03']
row_selector = (wanted_dates, 'Released')
print(tmdbDataSet.loc[row_selector, :])

In [None]:
# Every release date (first level left open) with status 'Rumored', all columns.
# pd.IndexSlice[:, 'Rumored'] spells the same (slice(None), 'Rumored') key.
tmdbDataSet.loc[pd.IndexSlice[:, 'Rumored'], :]

In [9]:
# Load the movies with release_date as the row index, parsed as dates.
# NOTE(review): the directory is spelled 'Cleaning Data' here but 'Cleaning data' in the
# first load cell — confirm the real casing; they only agree on case-insensitive filesystems.
dated_csv_path = './Cleaning Data/tmdb_5000_movies.csv'
tmdbDataSet_dateIndex = pd.read_csv(dated_csv_path, index_col='release_date', parse_dates=True)

In [17]:
# Number of non-missing vote_count entries per calendar day ('D' = daily bins).
daily_vote_entries = tmdbDataSet_dateIndex['vote_count'].resample('D').count()
print(daily_vote_entries)

release_date
1916-09-04    1
1916-09-05    0
1916-09-06    0
1916-09-07    0
1916-09-08    0
1916-09-09    0
1916-09-10    0
1916-09-11    0
1916-09-12    0
1916-09-13    0
1916-09-14    0
1916-09-15    0
1916-09-16    0
1916-09-17    0
1916-09-18    0
1916-09-19    0
1916-09-20    0
1916-09-21    0
1916-09-22    0
1916-09-23    0
1916-09-24    0
1916-09-25    0
1916-09-26    0
1916-09-27    0
1916-09-28    0
1916-09-29    0
1916-09-30    0
1916-10-01    0
1916-10-02    0
1916-10-03    0
             ..
2017-01-05    0
2017-01-06    0
2017-01-07    0
2017-01-08    0
2017-01-09    0
2017-01-10    0
2017-01-11    0
2017-01-12    0
2017-01-13    0
2017-01-14    0
2017-01-15    0
2017-01-16    0
2017-01-17    0
2017-01-18    0
2017-01-19    0
2017-01-20    0
2017-01-21    0
2017-01-22    0
2017-01-23    0
2017-01-24    0
2017-01-25    0
2017-01-26    0
2017-01-27    0
2017-01-28    0
2017-01-29    0
2017-01-30    0
2017-01-31    0
2017-02-01    0
2017-02-02    0
2017-02-03    1
Name: vote_

In [29]:
# Vote counts of the movies released in August 2009
# A month-level date string on the DatetimeIndex selects every row in that month.
count = tmdbDataSet_dateIndex.loc['2009-Aug', 'vote_count']

In [32]:
# Highest vote count per calendar day; days without any release show up as NaN.
daily_peak_votes = count.resample('D').max()
print(daily_peak_votes)

release_date
2009-08-04    1962.0
2009-08-05    3382.0
2009-08-06     560.0
2009-08-07       NaN
2009-08-08       NaN
2009-08-09       6.0
2009-08-10       NaN
2009-08-11       NaN
2009-08-12       NaN
2009-08-13     626.0
2009-08-14     798.0
2009-08-15       NaN
2009-08-16       NaN
2009-08-17       NaN
2009-08-18    6430.0
2009-08-19    1262.0
2009-08-20     177.0
2009-08-21     100.0
2009-08-22       NaN
2009-08-23       NaN
2009-08-24       NaN
2009-08-25       NaN
2009-08-26     831.0
2009-08-27       NaN
2009-08-28     267.0
Freq: D, Name: vote_count, dtype: float64
