In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk, string
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
import json
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Load the course table and keep only rows whose name does not mention
# "Thesis Research" (rows with a missing name are kept).
df = pd.read_csv('data.csv').loc[
    lambda frame: ~frame['name'].str.contains("Thesis Research", na=False)
]
# Optional extra filter, currently disabled: leave out LANG courses as well.
# df = df[~df['name'].str.contains("LANG ", na=False)]
df.head(2)

Unnamed: 0.1,Unnamed: 0,dept,description,name
0,0,ACCT,"For SB&amp;M students, and programs that desig...",ACCT 2010 - Principles of Accounting I (3 units)
1,1,ACCT,Study of the application of generally accepted...,ACCT 3010 - Financial Accounting I (3 units)


In [29]:
# Course descriptions, in the same row order as `df`.
documents = df['description'].values
# Single Porter stemmer instance shared by stem_tokens().
stemmer = PorterStemmer()
# Translation table for str.translate that deletes every ASCII punctuation character.
remove_punctuation_map = str.maketrans('', '', string.punctuation)

def stem_tokens(tokens):
    """Return a list with each token stemmed by the module-level ``stemmer``, in order."""
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Turn ``text`` into a list of stems.

    Steps: lower-case, delete punctuation via ``remove_punctuation_map``,
    split into words with ``nltk.word_tokenize``, then stem with ``stem_tokens``.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    words = nltk.word_tokenize(cleaned)
    return stem_tokens(words)


def tokenize(text):
    """Split ``text`` into stemmed, stop-word-free tokens.

    The text is lower-cased, every punctuation character, newline and tab is
    replaced by a space (so words joined by punctuation stay separate), runs
    of whitespace are collapsed, the result is tokenized with
    ``nltk.word_tokenize``, English stop words are dropped and the remaining
    tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stems of the non-stop-word tokens, in document order.
    """
    # Map each separator character to a single space in one pass.
    separators = string.punctuation + '\n\t'
    spaced = text.lower().translate(str.maketrans(separators, ' ' * len(separators)))
    # Collapse any run of whitespace, however long, to one space.
    spaced = ' '.join(spaced.split())

    tokens = nltk.word_tokenize(spaced)

    # Load the stop-word list once (as a set) instead of once per token.
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()  # one stemmer for the whole document

    # Stem only the tokens that survive the stop-word filter; the earlier
    # version computed the filtered list but then stemmed the unfiltered one.
    return [porter.stem(token) for token in tokens if token not in english_stop_words]


# Fit a TF-IDF model on the descriptions, tokenizing with normalize(),
# and keep the resulting document-term matrix.
tfidf = TfidfVectorizer(
    stop_words='english',
    tokenizer=normalize,
).fit_transform(documents)
print('DONE')



DONE


In [30]:
from sklearn.decomposition import TruncatedSVD
# Densify the TF-IDF matrix before handing it to t-SNE. The guard keeps this
# cell re-runnable: once `tfidf` is already a dense array it has no
# `.toarray()` and the conversion is skipped.
if hasattr(tfidf, 'toarray'):
    tfidf = tfidf.toarray()

# Fixed seed so the 2-D layout (and the exported JSON) is reproducible.
TSNE_SEED = 0
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version before upgrading.
model = TSNE(
    n_components=2,
    perplexity=200,
    verbose=2,
    n_iter=2000,
    n_iter_without_progress=300,
    random_state=TSNE_SEED,
).fit_transform(tfidf)


def _min_max(values):
    """Rescale a 1-D array linearly to [0, 1]; a constant array maps to all zeros."""
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Avoid 0/0 -> NaN, which json.dump would write as invalid JSON.
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span


# save to json file
x_axis = model[:, 0]
y_axis = model[:, 1]
x_norm = _min_max(x_axis)
y_norm = _min_max(y_axis)
data = {"x": x_norm.tolist(), "y": y_norm.tolist()}
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)

# Write the name/dept labels (plus the DataFrame index) next to the coordinates.
df.to_csv('labels.csv', header=True, encoding='utf-8', columns=['name', 'dept'])
df['dept'].value_counts()

[t-SNE] Computing 601 nearest neighbors...
[t-SNE] Indexed 1076 samples in 0.002s...
[t-SNE] Computed neighbors for 1076 samples in 0.138s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1076
[t-SNE] Computed conditional probabilities for sample 1076 / 1076
[t-SNE] Mean sigma: 0.369687
[t-SNE] Computed conditional probabilities in 0.190s
[t-SNE] Iteration 50: error = 48.4894676, gradient norm = 0.1594132 (50 iterations in 0.213s)
[t-SNE] Iteration 100: error = 48.5281715, gradient norm = 0.1407616 (50 iterations in 0.195s)
[t-SNE] Iteration 150: error = 48.4660034, gradient norm = 0.1481871 (50 iterations in 0.192s)
[t-SNE] Iteration 200: error = 48.4805984, gradient norm = 0.1517829 (50 iterations in 0.501s)
[t-SNE] Iteration 250: error = 48.4444695, gradient norm = 0.1357981 (50 iterations in 0.463s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 48.444469
[t-SNE] Iteration 300: error = 1.0246223, gradient norm = 0.0009914 (50 iterations in 0.248s

MATH    69
COMP    56
ISOM    55
LANG    48
CIVL    44
        ..
SUST     1
HLTH     1
BIPH     1
EVNG     1
GNED     1
Name: dept, Length: 76, dtype: int64