NLP 1

In [None]:
import nltk
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

# Sample text shared by every tokenizer / stemmer demo below
sentence = "His eyes were dancing with humor , doesn't it."

# --- Tokenization -----------------------------------------------------------

# str.split(): cuts on whitespace only
tokens_ws = sentence.split()
print("Whitespace Tokenization: ", tokens_ws)

# NLTK word_tokenize; its tokens feed the MWE, stemming and lemmatization demos
tokens_pb = word_tokenize(sentence)
print("Punctuation-Based Tokenization: ", tokens_pb)

# Penn Treebank tokenizer
treebank_tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens_tb = treebank_tokenizer.tokenize(sentence)
print("Treebank Tokenization: ", tokens_tb)

# Tweet tokenizer
tweet_tokenizer = TweetTokenizer()
tokens_tw = tweet_tokenizer.tokenize(sentence)
print("Tweet Tokenization: ", tokens_tw)

# Multi-word-expression tokenizer: merges the registered token sequences
# found in an already tokenized sentence
tokenizer = MWETokenizer()
tokenizer.add_mwe(('His', 'eyes'))
tokenizer.add_mwe(('cats', 'and', 'dogs'))
tokens_mwe = tokenizer.tokenize(tokens_pb)
print("MWE Tokenization: ", tokens_mwe)

# --- Stemming ---------------------------------------------------------------

ps = PorterStemmer()
stemmed_ps = list(map(ps.stem, tokens_pb))
print("Porter Stemming: ", stemmed_ps)

ss = SnowballStemmer('english')
stemmed_ss = list(map(ss.stem, tokens_pb))
print("Snowball Stemming: ", stemmed_ss)

# --- Lemmatization ----------------------------------------------------------

# No POS tag is passed, so the lemmatizer's default part of speech is used
wnl = WordNetLemmatizer()
lemmatized = list(map(wnl.lemmatize, tokens_pb))
print("Lemmatization: ", lemmatized)


Whitespace Tokenization:  ['His', 'eyes', 'were', 'dancing', 'with', 'humor', ',', "doesn't", 'it.']
Punctuation-Based Tokenization:  ['His', 'eyes', 'were', 'dancing', 'with', 'humor', ',', 'does', "n't", 'it', '.']
Treebank Tokenization:  ['His', 'eyes', 'were', 'dancing', 'with', 'humor', ',', 'does', "n't", 'it', '.']
Tweet Tokenization:  ['His', 'eyes', 'were', 'dancing', 'with', 'humor', ',', "doesn't", 'it', '.']
MWE Tokenization:  ['His_eyes', 'were', 'dancing', 'with', 'humor', ',', 'does', "n't", 'it', '.']
Porter Stemming:  ['hi', 'eye', 'were', 'danc', 'with', 'humor', ',', 'doe', "n't", 'it', '.']
Snowball Stemming:  ['his', 'eye', 'were', 'danc', 'with', 'humor', ',', 'doe', "n't", 'it', '.']
Lemmatization:  ['His', 'eye', 'were', 'dancing', 'with', 'humor', ',', 'doe', "n't", 'it', '.']


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NLP 3

In [None]:

#Importing Libraries

import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

In [None]:

# Load the uploaded news dataset into `df`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.

path_df = "/content/News_dataset.pickle"

with open(path_df, mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [None]:

#checking data

df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575


In [None]:
#Checking one article (row label 1) in full

# Single .loc call with (row, column) instead of chained indexing
df.loc[1, 'Content']

'Dollar gains on Greenspan speech\r\n\r\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\r\n\r\nAnd Alan Greenspan highlighted the US government\'s willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan\'s speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman\'s taking a much more sanguine view on the current account deficit than he\'s taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He\'s taking a longer-term view, laying out a set of cond

In [None]:
#Text cleaning

# Replace carriage returns and newlines with spaces, collapse each run of
# four spaces into one, and strip double quotes.
df['Content_Parsed_1'] = (
    df['Content']
    .str.replace("\r", " ")
    .str.replace("\n", " ")
    .str.replace("    ", " ")
    .str.replace('"', '')
)

In [None]:
#Text preparation

# Lower-case everything
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

# Strip punctuation marks. "?" and "." are regex metacharacters, so the
# replacement is forced to literal mode (regex=False) instead of relying on the
# pandas-version-dependent default of Series.str.replace.
punctuation_signs = list("?:!.,;")
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '', regex=False)

# Drop the possessive suffix "'s" (e.g. "ba's" -> "ba")
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "", regex=False)

  df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')


In [None]:


#Stemming and Lemmatization: fetch the NLTK resources

for nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:

#Stemming and Lemmatization

wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every space-separated word of `text` as a verb.

    The text is split on single spaces (so empty strings produced by
    consecutive spaces are kept) and re-joined with single spaces.
    """
    return " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v") for word in text.split(" ")
    )


# Iterate over the column values directly rather than df.loc[row][...] with
# row in range(len(df)): this no longer assumes a 0..n-1 index and avoids a
# chained row lookup per document.
lemmatized_text_list = [lemmatize_text(text) for text in df['Content_Parsed_4']]

df['Content_Parsed_5'] = lemmatized_text_list

In [None]:
df['Content_Parsed_5']

0       ad sales boost time warner profit quarterly pr...
1       dollar gain on greenspan speech the dollar hav...
2       yukos unit buyer face loan claim the owners of...
3       high fuel price hit ba profit british airways ...
4       pernod takeover talk lift domecq share in uk d...
                              ...                        
2220    bt program to beat dialler scam bt be introduc...
2221    spam e-mail tempt net shoppers computer users ...
2222    be careful how you code a new european directi...
2223    us cyber security chief resign the man make su...
2224    lose yourself in online game online role play ...
Name: Content_Parsed_5, Length: 2225, dtype: object

In [None]:
#Downloading

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

#Removing stop words

stop_words = list(stopwords.words('english'))

In [None]:
# Remove every stop word, matched as a whole word, from the lemmatized text.
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:

    # \b anchors restrict the match to whole words; re.escape guards against
    # any regex metacharacter inside a stop word.
    regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
    # regex=True must be explicit: pandas 2.0 changed the default to False,
    # under which the "\b...\b" pattern would be searched for literally and
    # no stop word would be removed.
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '', regex=True)

  df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')


In [None]:

# Inspect one fully processed article; single .loc call instead of chained indexing
df.loc[5, 'Content_Parsed_6']

'japan narrowly escape recession japan economy teeter   brink   technical recession   three months  september figure show revise figure indicate growth   01% -   similar-sized contraction   previous quarter   annual basis  data suggest annual growth   02% suggest  much  hesitant recovery   previously  think  common technical definition   recession  two successive quarter  negative growth  government  keen  play   worry implications   data  maintain  view  japan economy remain   minor adjustment phase   upward climb    monitor developments carefully say economy minister heizo takenaka    face   strengthen yen make export less competitive  indications  weaken economic condition ahead observers  less sanguine  paint  picture   recovery much patchier  previously think say paul sheard economist  lehman brothers  tokyo improvements   job market apparently  yet  fee   domestic demand  private consumption   02%   third quarter'

In [None]:
#Checking data

df.head(1)

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4,Content_Parsed_5,Content_Parsed_6
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569,Ad sales boost Time Warner profit Quarterly pr...,ad sales boost time warner profit quarterly pr...,ad sales boost time warner profit quarterly pr...,ad sales boost time warner profit quarterly pr...,ad sales boost time warner profit quarterly pr...,ad sales boost time warner profit quarterly pr...


In [None]:

# Keep the metadata, the raw text and the final parsed text only, and expose
# the last processing stage under the shorter name 'Content_Parsed'.

list_columns = ["File_Name", "Category", "Complete_Filename", "Content", "Content_Parsed_6"]

df = (
    df.loc[:, list_columns]
      .rename(columns={'Content_Parsed_6': 'Content_Parsed'})
)

In [None]:

df.head()
     

Unnamed: 0,File_Name,Category,Complete_Filename,Content,Content_Parsed
0,001.txt,business,001.txt-business,Ad sales boost Time Warner profit\r\n\r\nQuart...,ad sales boost time warner profit quarterly pr...
1,002.txt,business,002.txt-business,Dollar gains on Greenspan speech\r\n\r\nThe do...,dollar gain greenspan speech dollar hit hi...
2,003.txt,business,003.txt-business,Yukos unit buyer faces loan claim\r\n\r\nThe o...,yukos unit buyer face loan claim owners emba...
3,004.txt,business,004.txt-business,High fuel prices hit BA's profits\r\n\r\nBriti...,high fuel price hit ba profit british airways ...
4,005.txt,business,005.txt-business,Pernod takeover talk lifts Domecq\r\n\r\nShare...,pernod takeover talk lift domecq share uk dri...


In [None]:

#Generating new column for Category codes

category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}

# Category mapping.
# Series.map is used instead of DataFrame.replace: replace silently keeps any
# label missing from `category_codes` as a string inside the numeric column,
# whereas map turns it into NaN, which is detected and reported below.
df['Category_Code'] = df['Category'].map(category_codes)

unmapped_categories = df.loc[df['Category_Code'].isna(), 'Category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        "Categories without a code in category_codes: {}".format(list(unmapped_categories))
    )

In [None]:
# Hold out 15% of the documents for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    df['Content_Parsed'],
    df['Category_Code'],
    test_size=0.15,
    random_state=8,
)

In [None]:

# Parameter selection for the TF-IDF vectorizer
ngram_range = (1, 2)   # passed as TfidfVectorizer(ngram_range=...)
min_df = 10            # passed as TfidfVectorizer(min_df=...)
max_df = 1.0           # passed as TfidfVectorizer(max_df=...)
max_features = 300     # passed as TfidfVectorizer(max_features=...)

In [None]:
  tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)
     

(1891, 300)
(334, 300)


In [None]:
from sklearn.feature_selection import chi2
import numpy as np

# For each category (one-vs-rest), rank the TF-IDF terms by chi-squared score
# and print the five highest-scoring unigrams and two highest-scoring bigrams.
for category_name, category_id in sorted(category_codes.items()):
    chi2_scores, _p_values = chi2(features_train, labels_train == category_id)

    # Terms ordered by ascending score, so the best ones sit at the end
    ranked_terms = np.array(tfidf.get_feature_names_out())[np.argsort(chi2_scores)]

    unigrams, bigrams = [], []
    for term in ranked_terms:
        word_count = len(term.split(' '))
        if word_count == 1:
            unigrams.append(term)
        elif word_count == 2:
            bigrams.append(term)

    top_unigrams = '\n. '.join(unigrams[-5:])
    top_bigrams = '\n. '.join(bigrams[-2:])

    print("# '{}' category:".format(category_name))
    print("  . Most correlated unigrams:\n. {}".format(top_unigrams))
    print("  . Most correlated bigrams:\n. {}".format(top_bigrams))
    print("")

# 'business' category:
  . Most correlated unigrams:
. market
. price
. economy
. growth
. bank
  . Most correlated bigrams:
. last year
. year old

# 'entertainment' category:
  . Most correlated unigrams:
. tv
. music
. star
. award
. film
  . Most correlated bigrams:
. mr blair
. prime minister

# 'politics' category:
  . Most correlated unigrams:
. minister
. blair
. party
. election
. labour
  . Most correlated bigrams:
. prime minister
. mr blair

# 'sport' category:
  . Most correlated unigrams:
. win
. side
. game
. team
. match
  . Most correlated bigrams:
. say mr
. year old

# 'tech' category:
  . Most correlated unigrams:
. digital
. technology
. computer
. software
. users
  . Most correlated bigrams:
. year old
. say mr



In [None]:

bigrams

['tell bbc', 'last year', 'prime minister', 'mr blair', 'year old', 'say mr']

In [None]:
# Persist the splits, the processed frame, the feature matrices and the fitted
# vectorizer, one pickle file per object.
from pathlib import Path

PICKLE_DIR = Path('/content/drive/My Drive/Pickles')

# Create the target folder if it is missing (opening a file inside a missing
# folder raises FileNotFoundError). parents is deliberately left False: if the
# parent folder itself does not exist, this still fails loudly instead of
# silently creating the whole path.
PICKLE_DIR.mkdir(exist_ok=True)

# file stem -> object; written in the same order and under the same file names
# as the former one-block-per-object version
objects_to_pickle = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'df': df,
    'features_train': features_train,
    'labels_train': labels_train,
    'features_test': features_test,
    'labels_test': labels_test,
    'tfidf': tfidf,
}

for file_stem, obj in objects_to_pickle.items():
    with open(PICKLE_DIR / (file_stem + '.pickle'), 'wb') as output:
        pickle.dump(obj, output)
     

FileNotFoundError: ignored

NLP 2

In [None]:
import pandas as pd #Data Manipulation and Analysis
import matplotlib.pyplot as plt #plotting

 
import numpy as np #for working with arrays 
import seaborn as sns #statistical data visualization built on matplotlib
import warnings
warnings.filterwarnings('ignore') #NOTE: silences every warning for the rest of the session
import re
from time import time #Timing our operations
import collections 
from collections import defaultdict
import spacy #spaCy is a free, open-source library for NLP in Python.
from gensim.models import Word2Vec #Word2Vec word-embedding model
import logging
#Send INFO-level (and above) log records to the console with a time stamp
logging.basicConfig(format = "%(levelname)s - %(asctime)s: %(message)s",datefmt = '%H:%M:%S', level=logging.INFO)
from sklearn.manifold import TSNE #tool to visualize high dimensional data
from numpy import dot #dot product
from numpy.linalg import norm #vector / matrix norms

In [None]:
#Data Set Import 
df = pd.read_csv('/content/data.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [None]:
print('Shape of initial dataset:', df.shape)

Shape of initial dataset: (11914, 16)


In [None]:
#New column for combined make and model is created
df['Maker_Model'] = df['Make']+" "+df['Model']

In [None]:
print(df.shape)
df.head()

(11914, 17)


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP,Maker_Model
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135,BMW 1 Series M
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650,BMW 1 Series
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350,BMW 1 Series
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450,BMW 1 Series
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500,BMW 1 Series


In [None]:
# Collect the text-valued columns into their own dataframe, df1
text_columns = [
    'Engine Fuel Type',
    'Transmission Type',
    'Driven_Wheels',
    'Market Category',
    'Vehicle Size',
    'Vehicle Style',
    'Maker_Model',
]
df1 = df[text_columns]
print(df1.shape)
df1.head()

(11914, 7)


Unnamed: 0,Engine Fuel Type,Transmission Type,Driven_Wheels,Market Category,Vehicle Size,Vehicle Style,Maker_Model
0,premium unleaded (required),MANUAL,rear wheel drive,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,BMW 1 Series M
1,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,Performance",Compact,Convertible,BMW 1 Series
2,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,High-Performance",Compact,Coupe,BMW 1 Series
3,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,Performance",Compact,Coupe,BMW 1 Series
4,premium unleaded (required),MANUAL,rear wheel drive,Luxury,Compact,Convertible,BMW 1 Series


In [None]:
# Collapse each row of df1 into one comma-separated string (values cast to str first)
df2 = df1.astype(str).agg(','.join, axis=1)
print(df2.shape)
df2.head()

(11914,)


0    premium unleaded (required),MANUAL,rear wheel ...
1    premium unleaded (required),MANUAL,rear wheel ...
2    premium unleaded (required),MANUAL,rear wheel ...
3    premium unleaded (required),MANUAL,rear wheel ...
4    premium unleaded (required),MANUAL,rear wheel ...
dtype: object

In [None]:
# Wrap the joined strings in a one-column dataframe named df_clean (column 'clean')
df_clean = df2.to_frame(name='clean')
df_clean.head()

Unnamed: 0,clean
0,"premium unleaded (required),MANUAL,rear wheel ..."
1,"premium unleaded (required),MANUAL,rear wheel ..."
2,"premium unleaded (required),MANUAL,rear wheel ..."
3,"premium unleaded (required),MANUAL,rear wheel ..."
4,"premium unleaded (required),MANUAL,rear wheel ..."


In [None]:
df_clean.shape

(11914, 1)

In [None]:
# Corpus for Gensim: one list of comma-separated tokens per row
sent = df_clean['clean'].str.split(',').tolist()
sent[:2]

[['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Factory Tuner',
  'Luxury',
  'High-Performance',
  'Compact',
  'Coupe',
  'BMW 1 Series M'],
 ['premium unleaded (required)',
  'MANUAL',
  'rear wheel drive',
  'Luxury',
  'Performance',
  'Compact',
  'Convertible',
  'BMW 1 Series']]

In [None]:
# Train a skip-gram (sg=1) Word2Vec model with 50-dimensional vectors.
# gensim 4.0 renamed the `size` argument to `vector_size`; passing `size` to
# gensim >= 4 raises TypeError, so the keyword is chosen from the installed
# major version to keep the cell working on both API generations.
import gensim

_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = Word2Vec(sent, min_count=1, workers=3, window=3, sg=1, **{_dim_kwarg: 50})




In [None]:
model.wv['Toyota Camry']

array([-0.33180067,  0.01650393, -0.04329881, -0.11241703,  0.07930288,
        0.18482944, -0.2056361 , -0.05768769, -0.11798818,  0.11985294,
       -0.12829064, -0.02519825,  0.15462549, -0.06983617, -0.00581656,
       -0.11208623,  0.27765584, -0.09211524,  0.08034034,  0.08573758,
        0.10593732,  0.02578118,  0.01473022, -0.11561292,  0.01135474,
        0.15151139, -0.03319553, -0.01059608, -0.17220801, -0.00065012,
       -0.07801175,  0.06413625,  0.0474244 , -0.1657386 ,  0.09889247,
       -0.26238126, -0.00902424, -0.02241157, -0.2192509 ,  0.08897874,
       -0.17385125,  0.08624125, -0.09840593,  0.13556433, -0.14408243,
        0.18930626,  0.14544302,  0.13500395, -0.10921054, -0.10471402],
      dtype=float32)