In [None]:
# Python has no separate type for "a number written in binary", so the binary
# digits are kept in strings here. A bare literal such as 10 would be read as
# the base-10 integer ten.
b = "10"
c = "100"

# int() parses a string of digits in the base given by its optional second
# argument. With base 2 the digits are read as binary and the result is an
# ordinary Python int (which prints in base 10).
print(int(b, base=2))
base_10_100 = int(c, base=2)

###  Binary addition
We can add binary numbers together, just like we can with base 10 numbers.

In [None]:
# Base 10 has ten digits, so 9 is the largest value a single digit can hold.
a = 9

# One more than 9 needs a second digit: the (implicit) leading 0 becomes 1 and
# the last digit wraps around to 0.
a = a + 1
print(a)

# 19 + 1: the last digit wraps to 0 and the leading 1 is incremented, giving 20.
a = 19
a = a + 1

# 99 + 1: the last digit wraps to 0 and carries into the first digit, which
# also wraps to 0 and carries into a new leading digit, giving 100.
a = 99
a = a + 1

# Binary works the same way, except that the largest single digit is 1.
b = "1"


def binary_add(a, b):
    """Add two binary-digit strings and return the sum as a binary-digit string.

    Both arguments are parsed with int(..., 2). As a tracing aid the function
    prints the first argument followed by the integer values of both arguments.
    """
    left_value = int(a, 2)
    right_value = int(b, 2)
    print(a, left_value, right_value)
    # bin() prefixes its result with "0b"; the slice drops that prefix.
    return bin(left_value + right_value)[2:]


c = binary_add(b, "1")

### Converting Binary Values to Other Bases

Let's see which values in binary equal which values in base 10.

In [None]:
# Exercise: convert "1001" to base 10 and assign the result to base_10_1001.
def binary_add(a, b):
    """Return the sum of two binary-digit strings as a binary-digit string."""
    total = int(a, 2) + int(b, 2)
    # bin() marks its output as binary with a leading "0b"; slice it off so the
    # result can be fed straight back into this function.
    return bin(total)[2:]


# Keep the same quantity twice: once as an int, once as a binary string.
a = 0
b = "0"

# Add one to each representation ten times and confirm they still agree.
for i in range(10):
    a = a + 1
    b = binary_add(b, "1")
    print(a == int(b, 2))

# The two counters stay equal because base 2 and base 10 are only different
# ways of writing the same numbers; binary simply needs more digits for the
# same value.
base_10_1001 = int("1001", base=2)

### Converting Characters to Binary
Computers store strings in binary, just like they do with integers. First, they split them into single characters, then convert those characters to integers. Finally, they convert those integers to binary and store them.

In [None]:
# ord() returns the integer code point of a single character.
ord("a")

# bin() renders that integer as a binary string; the leading "0b" marks the
# string as binary.
bin(ord("a"))

# ÿ has the integer value 255, the largest value that fits in eight binary
# digits (one byte).
ord("ÿ")
# Its binary form is eight 1s, i.e. the highest possible eight-digit value.
bin(ord("ÿ"))
print(bin(ord("ÿ")))

# One byte can hold 256 distinct values (0-255). Strictly speaking, ASCII
# itself defines only the first 128 of them; values 128-255 such as ÿ come
# from 8-bit extensions like Latin-1.
binary_w = bin(ord("w"))
binary_bracket = bin(ord("}"))

###  Introduction to Unicode


In [None]:
# Exercise: find the binary representation of "\u1019" and assign it to binary_1019.
# A string can hold any Unicode code point. The character below is written
# literally here; according to the original note its code point is \u27F6.
code_point = "⟶"

# Printing the string shows the character itself (a right arrow).
print(code_point)

# ord() gives the code point as a base 10 integer.
print(ord(code_point))

# The binary form has more than eight digits, so this code point cannot be
# stored in a single byte.
print(bin(ord(code_point)))

# "\u1019" is an escape for the code point with hexadecimal value 1019;
# ord() turns it into an integer and bin() into a "0b..." binary string.
binary_1019 = bin(ord("\u1019"))
print("\u1019")

### Strings with Unicode
ASCII is a subset of Unicode. Unicode implements all of the ASCII characters, as well as the additional characters that code points allow.

In [None]:
# Exercise: make a string that combines Unicode and ASCII, and assign it to s3.
s1 = "café"
# The \u prefix means "the next four hexadecimal digits are a Unicode code point".
# Writing a character as an escape does not change its value. Per the original
# note, the last character of the string below was entered as \u00e9; it is
# shown here as the literal character.
s2 = "café"

# Prints the integer code point of é followed by the character produced by
# the \u00e9 escape.
print(ord("é"), " \u00e9")
# An escape and the character it denotes produce the same string contents,
# so the comparison below checks that s1 and s2 are equal.
print(s1 == s2)

s3 = "hi $"

###  The Bytes Data Type
Python includes a data type called "bytes." It's similar to a string, except that it contains encoded bytes values.

When we create an object with a bytes type from a string, we specify an encoding system (usually UTF-8).

Then, we can use the .encode() method to encode the string into bytes.

In [None]:
# Exercise: encode batman in UTF-8 and assign the result to batman_bytes.
# A string may contain non-ASCII Unicode characters.
superman = "Clark Kent␦"
print(superman)

# str.encode() converts the text into a bytes object using the named
# encoding (UTF-8 here); the result is a sequence of bytes, not a string.
superman_bytes = superman.encode("utf-8")

batman = "Bruce Wayne␦"
batman_bytes = batman.encode("utf-8")

### Hexadecimal Conversions

In [1]:
# F is the largest single hexadecimal (base 16) digit; it is 15 in base 10.
print(int("F", base=16))

# A in base 16 is 10 in base 10.
print(int("A", base=16))


def hexadecimal_add(a, b):
    """Return the sum of two hexadecimal-digit strings as a hexadecimal-digit string.

    Mirrors the earlier binary_add helper, but in base 16.
    """
    total = int(a, 16) + int(b, 16)
    # hex() prefixes its result with "0x"; the slice drops that prefix.
    return hex(total)[2:]


# Adding 1 to 9 in hexadecimal yields the single digit "a".
value = "9"
value = hexadecimal_add(value, "1")
print(value)

hex_ea = hexadecimal_add("ea", "2")
hex_ef = hexadecimal_add("e", "f")

15
10
a


### Hex to Binary
We can convert hexadecimal to binary fairly easily. We can even use the ord() and bin() functions that helped us convert code points to binary.

In [None]:
# A single character whose value fits in one byte (eight bits); per the
# original note its hexadecimal escape is \xe2.
hex_byte = "â"

# ord() gives the base 10 integer value of that character.
print(ord(hex_byte))

# Parsing the two hexadecimal digits directly gives the same integer: \x is
# only a prefix and does not affect the value.
print(int("e2", base=16))

# bin() converts the integer to its binary string form.
print(bin(ord(hex_byte)))

# Two routes to binary: parse hexadecimal digits with int(), or take the code
# point of a \x-escaped character with ord().
binary_aa = bin(int("aa", base=16))
binary_ab = bin(ord("\xab"))

### Bytes and Strings
There's no encoding system associated with the bytes data type. That means if we have an object with that data type, Python won't know how to display the (encoded) code points in it. For this reason, we can't mix bytes objects and strings together.


In [None]:
hulk_bytes = "Bruce Banner␦".encode("utf-8")

# Strings and bytes cannot be mixed. bytes.replace() only accepts bytes-like
# arguments, so passing str arguments raises TypeError. Only that specific
# exception is caught, so any unrelated failure would still surface.
try:
    hulk_bytes.replace("Banner", "")
except TypeError:
    print("TypeError with replacement")

# A bytes literal is written by putting a b in front of the quotation marks.
hulk_bytes = b"Bruce Banner"
# With bytes arguments the replace call succeeds. bytes objects are immutable,
# so replace() returns a new object; the result is not stored here and
# hulk_bytes itself is unchanged.
hulk_bytes.replace(b"Banner", b"")

thor_bytes = b"Thor"

### Decode Bytes and Strings
Once we have a bytes object, we can decode it into a string using an encoding system. We use the .decode() method to do this.

In [None]:
# A bytes object holding aquaman's secret identity.
aquaman_bytes = b"Who knows?"

# bytes.decode() interprets the bytes with the named encoding (UTF-8) and
# returns the corresponding string.
aquaman = aquaman_bytes.decode(encoding="utf-8")

# Show the decoded value and its type to confirm that it is a str.
print(aquaman)
print(type(aquaman))

morgan_freeman_bytes = b"Morgan Freeman"
morgan_freeman = morgan_freeman_bytes.decode(encoding="utf-8")

### Read in File Data

In [4]:
# Read the data with csv.reader.
import csv

# The encoding used to decode the file is given explicitly (UTF-8).
# The with-statement closes the file as soon as the rows have been read, and
# newline="" lets the csv module do its own newline handling, as the csv
# documentation recommends for files passed to csv.reader.
with open("sentences_cia.csv", "r", encoding="utf-8", newline="") as f:
    csvreader = csv.reader(f)
    # Materialize every row while the file is still open.
    sentences_cia = list(csvreader)

# The data consists of two columns: the first contains the year, and the
# second contains a sentence from a CIA report written in that year.
# Print the first column of the second row (the first row holds the headers).
print(sentences_cia[1][0])

# Print the second column of the second row.
print(sentences_cia[1][1])

# Second column of the row at index 9 (the tenth line of the file, counting
# the header line).
sentences_ten = sentences_cia[9][1]

1997
The FBI information included that al-Mairi's brother "traveled to Afghanistan in 1997-1998 to train in Bin - Ladencamps."


### Convert to DataFrame


In [5]:
import csv

# pandas supplies the DataFrame class used below to turn the list of rows
# into a dataframe.
import pandas as pd

# Read the legislators data. The with-statement closes the file once the rows
# are loaded, and newline="" lets the csv module handle line endings itself.
with open("legislators.csv", "r", encoding="utf-8", newline="") as f:
    csvreader = csv.reader(f)
    legislators = list(csvreader)

# Converting the raw list of lists puts the header row into the data itself...
legislators_df = pd.DataFrame(legislators)

# ...which shows up as column names appearing in the first row.
print(legislators_df.iloc[0, :])

# To fix that, pass only the data rows as the data and the first row as the
# column names via the columns argument.
legislators_df = pd.DataFrame(legislators[1:], columns=legislators[0])
# The first row now holds real data, labelled with the proper headers.
print(legislators_df.iloc[0, :])

# sentences_cia (the list of rows loaded earlier) gets the same treatment.
print(sentences_cia[0][1])
# NOTE: this rebinds sentences_cia from a list of rows to a DataFrame, so the
# statement is meant to run once, right after the rows have been loaded.
sentences_cia = pd.DataFrame(sentences_cia[1:], columns=sentences_cia[0])

0                 title
1             firstname
2            middlename
3              lastname
4           name_suffix
5              nickname
6                 party
7                 state
8              district
9             in_office
10               gender
11                phone
12                  fax
13              website
14              webform
15      congress_office
16          bioguide_id
17         votesmart_id
18               fec_id
19          govtrack_id
20               crp_id
21           twitter_id
22    congresspedia_url
23          youtube_url
24          facebook_id
25         official_rss
26         senate_class
27            birthdate
28             oc_email
Name: 0, dtype: object
title                                                              Rep
firstname                                                         Neil
middlename                                                            
lastname                                                   Abercromb

### Clean up Sentences


In [7]:
# The integer codes for all the characters we want to keep:
# digits 0-9 (48-57), uppercase A-Z (65-90), lowercase a-z (97-122), space (32).
good_characters = (
    list(range(48, 58))
    + list(range(65, 91))
    + list(range(97, 123))
    + [32]
)

# The statement stored under row label 14.
sentence_15 = sentences_cia["statement"][14]

# Walk the sentence one character at a time and keep only the characters
# whose integer code is listed in good_characters. The result is a list of
# single-character strings.
cleaned_sentence_15_list = [
    char for char in sentence_15 if ord(char) in good_characters
]

# Glue the kept characters back together with an empty separator to get a
# string again.
cleaned_sentence_15 = "".join(cleaned_sentence_15_list)

#Make a function that takes a dataframe row and then returns the clean version of the "statement" column
def clean_statement(df_row):
    """Return the row's "statement" text with every unwanted character removed.

    Args:
        df_row: A dataframe row (or any mapping) whose "statement" entry is a
            string.

    Returns:
        A string containing, in their original order, only the characters of
        the statement whose integer code (ord) appears in the module-level
        good_characters collection.
    """
    # Build a set once per call so each membership test is a hash lookup
    # rather than a scan of the whole good_characters list for every character.
    allowed_codes = set(good_characters)
    return "".join(
        char for char in df_row["statement"] if ord(char) in allowed_codes
    )

# Call clean_statement once per row (axis=1 passes rows rather than columns)
# and store the results in a new "cleaned_statement" column.
sentences_cia['cleaned_statement'] = sentences_cia.apply(clean_statement, axis=1)
# Show the first few rows to check the new column.
sentences_cia.head()


Unnamed: 0,year,statement,Unnamed: 3,Unnamed: 4,Unnamed: 5,cleaned_statement
0,1997,The FBI information included that al-Mairi's b...,,,,The FBI information included that alMairis bro...
1,1997,The FBI information included that al-Mairi's b...,,,,The FBI information included that alMairis bro...
2,1997,"For example, on October 12, 2004, another CIA ...",,,,For example on October 12 2004 another CIA det...
3,1997,"On October 16, 2001, an email from a CTC offic...",,,,On October 16 2001 an email from a CTC officer...
4,1997,"For example, on October 12, 2004, another CIA ...",,,,For example on October 12 2004 another CIA det...


### Tokenize statements


In [8]:
# str.join() concatenates an iterable of strings; the string the method is
# called on is the separator placed between consecutive items.
combined_statements = " ".join(sentences_cia["cleaned_statement"])

# Small experiment showing joining and the two ways of splitting.
st = "I am different"
st1 = "I am Tapasi"
st = st + " " + st1
# split() with no argument splits on runs of whitespace and never returns
# empty strings; split(" ") splits on every single space, so consecutive
# spaces produce empty strings.
st2 = st.split()
st3 = st.split(" ")

splited_statement = combined_statements.split()

statement_tokens = combined_statements.split(" ")
# #count the repeated words
# from collections import Counter
# statement_tokens = Counter(splited_statement)

### Filter the tokens


In [10]:
# statement_tokens has been loaded in.
# filtered_tokens = []
# for word in statement_tokens:
#     if len(word) >= 5:
#         filtered_tokens.append(word)
    
    
# Keep only the tokens that are at least five characters long.
filtered_tokens = [
    token for token in statement_tokens if len(token) >= 5
]

#[s for s in df_row['statement'] if ord(s) in good_characters]

### Count the tokens


In [11]:
from collections import Counter

fruits = [
    "apple", "apple", "banana", "orange",
    "pear", "orange", "apple", "grape",
]

# A Counter is a dictionary subclass that maps each distinct item to the
# number of times it was seen.
fruit_count = Counter()
fruit_count.update(fruits)

# Each distinct fruit is now a key whose value is its number of occurrences.
print(fruit_count)

# Count how many times each filtered token occurs.
# (Counter is imported again here so that this snippet also works on its own.)
from collections import Counter
filtered_token_counts = Counter(filtered_tokens)

Counter({'apple': 3, 'orange': 2, 'banana': 1, 'pear': 1, 'grape': 1})


### Most common tokens


In [None]:
from collections import Counter

fruits = [
    "apple", "apple", "banana", "orange",
    "pear", "orange", "apple", "grape",
]
fruit_count = Counter()
fruit_count.update(fruits)
print(fruit_count)

# Counter.most_common(n) returns the n most frequent items as a list of
# (item, count) pairs, most frequent first.
print(fruit_count.most_common(2))
print(fruit_count.most_common(3))

# filtered_token_counts (the Counter of filtered tokens) has been loaded in.
# Take its three most frequent tokens as a list of (token, count) pairs.
common_tokens = filtered_token_counts.most_common(3)

### Finding the most common token by year


In [None]:
# sentences_cia has been loaded in.
# It already has the cleaned_statement column.
def most_common_in_year(df, yr, min_length=5, top_n=2):
    """Return the most frequent long words in one year's cleaned statements.

    Args:
        df: DataFrame with a "year" column and a "cleaned_statement" column
            of strings.
        yr: Year to select. It is compared with ``==`` against the "year"
            column, so it must have the same type as the stored values
            (the callers in this file pass strings such as '2000').
        min_length: Minimum number of characters a word needs in order to be
            counted. Defaults to 5.
        top_n: Number of (word, count) pairs to return. Defaults to 2.

    Returns:
        A list of at most ``top_n`` ``(word, count)`` tuples, most frequent
        first. The list is empty when no row matches ``yr`` or no word is
        long enough.
    """
    # Rows whose year matches the requested one.
    year_rows = df[df["year"] == yr]

    # Merge that year's statements into one space-separated string.
    combined_statement = " ".join(year_rows["cleaned_statement"])

    # split() without an argument never yields empty strings, so lowering
    # min_length cannot cause empty tokens to be counted.
    long_words = (
        word for word in combined_statement.split() if len(word) >= min_length
    )

    # Count the remaining words and return the most frequent ones.
    return Counter(long_words).most_common(top_n)


# Most common words for three individual years. The years are passed as
# strings because csv.reader yields every field as a string.
common_2000 = most_common_in_year(sentences_cia,'2000') 
common_2002 = most_common_in_year(sentences_cia,'2002') 
common_2013 = most_common_in_year(sentences_cia,'2013') 