# Evolution of the Modern Smartphone - Visualization Creation

This notebook shows how I created my visualizations for my I590 Data Visualization Final Presentation. The spreadsheet that I am using for data can be found at: https://docs.google.com/spreadsheets/d/1PeJFSLYb6GOZtUfwjJ-XD7rE4KGexA-dVoUfRHnSngA/edit?usp=sharing and has been sourced from GSMArena, Wikipedia, WikiChip and Geekbench.

## First load our libraries

In [8]:
import glob
import os
import time

import cv2
import matplotlib.pyplot as plt
import pandas as pd

pd.options.display.max_columns = 500

pandas - for reading our data from the excel sheet and storing it <br/>
matplotlib - for creating our graphs and saving them <br/>
glob - for reading all the graphs that we have saved as images <br/>
cv2 - for creating the video output <br/>
time - for keeping track of how long it takes to run each time <br/>

## Load our data

In [3]:
# read the spreadsheet and order the rows chronologically by release date
df = pd.read_excel('phones.xlsx').sort_values(by=['Release Date'])
# strip spaces out of the column names (e.g. 'Release Date' -> 'ReleaseDate')
df.columns = df.columns.str.replace(' ', '')
# preview the first few rows
df.head()

Unnamed: 0,Name,Brand,ScreenSize,ResolutionWidth,ResolutionHeight,BackCamera,FrontCamera,Memory,CPU,GPU,Battery,ReleaseDate,h,w,d,weight,s2b,cardslot,storagemin,storagemax,VideoResolution,VideoFrameRate,aperature,sensorsize,headphonejack,bluetooth,wirelesscharging,wirelesschargingspeed,processsize,corecount,clockspeed,GB4S,GB4M,GB5S,GB5M,Notes
0,iPhone,Apple,3.5,320,480,2.0,0.0,128,S5L8900,PowerVR MBX Lite 3D,1400,2007-06-29,115.0,61.0,11.6,135.0,52.0,0,4,16,0,0,0.0,0.0,0*,2.0,0.0,0.0,90,1,412,0,0,0,0,
1,iPhone 3G,Apple,3.5,320,480,2.0,0.0,128,S5L8900,PowerVR MBX Lite 3D,1150,2008-07-11,115.5,62.1,12.3,133.0,50.9,0,8,16,0,0,0.0,0.0,1,2.0,0.0,0.0,90,1,412,0,0,0,0,
2,iPhone 3GS,Apple,3.5,320,480,3.15,0.0,256,S5PC100,PowerVR SGX535,1220,2009-06-19,115.5,62.1,12.3,135.0,50.9,0,8,32,480,30,2.8,0.0,1,2.1,0.0,0.0,65,1,600,0,0,0,0,
3,Galaxy S,Samsung,4.0,480,800,5.0,2.0,512,Exynos 3,PowerVR SGX 540,1500,2010-06-04,122.4,64.2,12.45,119.0,66.8,1,8,16,720,30,0.0,0.0,1,3.0,0.0,0.0,45,1,1000,0,0,0,0,
4,iPhone 4,Apple,3.5,640,960,5.0,2.0,512,A4,PowerVR SGX535,1420,2010-06-24,115.2,58.6,9.3,137.0,54.0,0,8,32,720,30,3.2,3.2,1,2.1,0.0,0.0,45,1,800,0,0,0,0,


First we load our data from the Excel sheet we have downloaded from Google Sheets. Then we sort by the release date, do some slight cleaning of the column names (removing spaces), and show a preview of our data.

## Create our functions

In [4]:
# used to create labels on the graph
def get_units(category):
    """Return the x-axis label (description plus units) for a data column.

    ``category`` is compared against each known column name in turn; the
    label paired with the first match is returned. A category that matches
    none of them yields ``None``.
    """
    axis_labels = (
        ('ScreenSize', "Screen Size (diagonal) in inches"),
        ('ResolutionWidth', "Screen Resolution (width) in pixels"),
        ('ResolutionHeight', "Screen Resolution (height) in pixels"),
        ('BackCamera', "Rear Camera Resolution (megapixels)"),
        ('FrontCamera', "Front Camera Resolution (megapixels)"),
        ('Memory', "Phone Memory (RAM in Megabytes)"),
        ('Battery', "Battery Capacity (milliamp hours)"),
        ('weight', "Phone Weight (grams)"),
        ('s2b', "Phone Screen to Body Ratio (percentage)"),
        ('storagemin', "Phone Storage minimum (Gigabytes)"),
        ('storagemax', "Phone Storage maximum (Gigabytes)"),
        ('corecount', "Phone Core Count"),
        ('clockspeed', "Phone maximum clockspeed (megahertz)"),
        ('GB4S', "Geekbench 4 Single Core Score"),
        ('GB4M', "Geekbench 4 Multi Core Score"),
        ('GB5S', "Geekbench 5 Single Core Score"),
        ('GB5M', "Geekbench 5 Multi Core Score"),
    )
    for column_name, axis_label in axis_labels:
        if category == column_name:
            return axis_label
    # unknown category: no label available
    return None

In [6]:
# bar colors that are roughly associated with the brands; every other brand
# falls back to _OTHER_BRAND_COLOR
_BRAND_COLORS = {
    'Apple': '#ababab',
    'Samsung': '#3e70fa',
    'Google': '#3cba54',
}
_OTHER_BRAND_COLOR = '#996b32'


def _brand_colors(brands):
    """Return one bar color per brand name, in the same order as *brands*."""
    return [_BRAND_COLORS.get(brand, _OTHER_BRAND_COLOR) for brand in brands]


def _frame_dates(days):
    """Yield 'year-month-day' strings for 2007 through 2019 using *days* of each month."""
    for year in range(2007, 2020):
        for month in range(1, 13):
            for day in days:
                # keep the date valid: February only has 28/29 days
                if month == 2 and day > 28:
                    continue
                yield '{}-{}-{}'.format(year, month, day)


def _draw_frame(df, category, dt, path, x_limit, num_to_display, show_all=False):
    """Plot the devices released before *dt*, save the figure to *path*, then clear the axes."""
    # only devices released before dt, ordered by the category value
    # (ties broken by release date)
    released = df[df['ReleaseDate'] < dt].sort_values(by=[category, 'ReleaseDate'])
    values = released[category].to_list()
    names = released['Name'].to_list()
    # colors are rebuilt for every frame so they always line up with the
    # current sort order (cheap compared to rendering the figure)
    colors = _brand_colors(released['Brand'].to_list())
    count = len(values)

    if show_all:
        positions = list(range(count))
    else:
        # stack bars downward so that the y-limit window below selects the
        # largest values rather than the smallest
        positions = [-i for i in range(count)]

    plt.barh(positions, values, tick_label=names, color=colors)
    axes = plt.gca()
    # x axis runs from 0 to slightly past the overall maximum of the category
    axes.set_xlim([0, x_limit])
    if not show_all:
        # restrict the y axis to the num_to_display bars with the largest values
        window_top = -count + num_to_display + 0.5
        window_bottom = -count + 0.5
        axes.set_ylim([window_top, window_bottom])
    # the title is the current date so the viewer can follow the timeline
    plt.title(dt)
    plt.gcf().subplots_adjust(left=0.1)
    plt.gcf().set_size_inches(16, 9)
    plt.xlabel(get_units(category))
    plt.ylabel('Device')
    # write the value at the end of every bar that is inside the visible window
    for position, value in zip(positions, values):
        if show_all or window_bottom < position < window_top:
            plt.text(value, position, value)
    plt.savefig(path, pad_inches=0.5)
    plt.cla()


def _write_video(category):
    """Combine every saved 'images/<category>*.jpg' frame into 'images/<category>.mp4'."""
    images = glob.glob('images/' + category + '*.jpg')
    # order frames by modification time so they play in the order they were written
    images.sort(key=os.path.getmtime)
    out = cv2.VideoWriter('images/' + category + '.mp4',
                          cv2.VideoWriter_fourcc(*'mp4v'), 60, (1280, 720))
    try:
        for image in images:
            frame = cv2.resize(cv2.imread(image), (1280, 720))
            out.write(frame)
    finally:
        # always release the writer so the video file is finalized/closed
        out.release()


def graph_from_list(df, to_graph, num_to_display=10):
    """Render animated "bar chart race" videos, one per category.

    For each category in ``to_graph`` this:

    1. saves one bar chart per day (days 1-30 of every month from 2007
       through 2019, February capped at the 28th) showing the
       ``num_to_display`` devices with the largest values released before
       that day, as ``images/<category><date>.jpg``;
    2. saves a "recap" pass (days 1, 14 and 28 of every month) showing all
       devices released so far, as ``images/<category><date>_recap.jpg``;
    3. stitches every ``images/<category>*.jpg`` file, ordered by
       modification time, into ``images/<category>.mp4`` (1280x720).

    The elapsed seconds since the category started are printed after each
    of the three steps, and the total elapsed seconds are printed at the end.

    Parameters:
        df: DataFrame with 'ReleaseDate', 'Name' and 'Brand' columns plus
            one column per entry in ``to_graph``.
        to_graph: iterable of column names to visualize.
        num_to_display: how many bars are visible in the per-day frames.

    Returns:
        None. Output is written to the ``images`` directory, which is
        created if it does not exist.
    """
    # timestamp for reporting the total run time at the end
    run_start = time.time()
    for g in to_graph:
        # timestamp for reporting the time spent on this category
        category_start = time.time()
        # make sure the output directory exists before the first savefig
        os.makedirs('images', exist_ok=True)
        # the x-axis limit is the same for every frame of a category
        x_limit = df[g].max() * 1.1

        # first step: one frame per day, showing only the top devices
        for dt in _frame_dates(range(1, 31)):
            _draw_frame(df, g, dt, 'images/' + g + dt + '.jpg',
                        x_limit, num_to_display)
        print(time.time() - category_start)

        # second step: a quicker recap with fewer days, showing all devices;
        # saved under a different name so the files do not overlap
        for dt in _frame_dates([1, 14, 28]):
            _draw_frame(df, g, dt, 'images/' + g + dt + '_recap.jpg',
                        x_limit, num_to_display, show_all=True)
        print(time.time() - category_start)

        # third and final step: build the video from the saved frames
        _write_video(g)
        print(time.time() - category_start)
    # time to complete all categories
    print(time.time() - run_start)

In [7]:
# the categories (column names) we want to create visualizations for
to_graph = [
    'ScreenSize',
    'ResolutionWidth',
    'ResolutionHeight',
    'BackCamera',
    'FrontCamera',
    'Memory',
    'Battery',
    'weight',
    's2b',
    'storagemin',
    'storagemax',
    'corecount',
    'clockspeed',
    'GB4S',
    'GB4M',
    'GB5S',
    'GB5M',
]
# render every category from the data frame, showing 10 devices per frame
graph_from_list(df, to_graph, num_to_display=10)
