# Importing data in pandas

## Importing CSV file


This read operation loads the CSV file `diabetes.csv` into a pandas DataFrame object named `df`.

In [1]:
import pandas as pd

# Read the diabetes CSV from the shared data folder into a DataFrame.
diabetes_csv = "../data/diabetes.csv"
df = pd.read_csv(diabetes_csv)

In [2]:
# Preview the first five rows (5 is head()'s default row count).
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Using usecols in read_csv()

Here we load only four of the columns by listing them in `usecols`, and `header=0` (the default) tells pandas to take the column names from the first row of the file.

In [3]:
# Restrict the load to four columns; header=0 takes the column names from the
# first row of the file.
wanted_columns = ["BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv('../data/diabetes.csv', header=0, usecols=wanted_columns)
df.head()

Unnamed: 0,BMI,DiabetesPedigreeFunction,Age,Outcome
0,33.6,0.627,50,1
1,26.6,0.351,31,0
2,23.3,0.672,32,1
3,28.1,0.167,21,0
4,43.1,2.288,33,1


### Make a Column the Index in read_csv()


In [4]:
# Plain load: pandas adds its own 0-based row index and keeps Id as an
# ordinary column.
iris_csv = '../data/iris.csv'
df = pd.read_csv(iris_csv)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# index_col=0 promotes the first column of the file to the row index.
df = pd.read_csv(filepath_or_buffer='../data/iris.csv', index_col=0)
df.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


As a result, the `Id` column (the first column in the file, selected with `index_col=0`) becomes the index of the DataFrame.

## Importing Excel files

In [6]:
# Read the workbook with default settings; in the preview the sheet's title
# row ends up as the column header, which the next cell corrects.
df = pd.read_excel(io='../data/patent.xlsx')
df.head()

Unnamed: 0,1、2010年：中国专利诉讼地域分布图,Unnamed: 1
0,Court Province,Patents Quantity
1,浙江省,233
2,广东省,201
3,上海市,91
4,江苏省,84


In [7]:
# header=1 takes the column names from the second spreadsheet row (0-based),
# skipping the title row above it.
patent_xlsx = '../data/patent.xlsx'
df = pd.read_excel(patent_xlsx, header=1)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,浙江省,233
1,广东省,201
2,上海市,91
3,江苏省,84
4,北京市,34


In [8]:
# sheet_name=1 selects the second worksheet (positions are 0-based);
# header=1 reads the column names from that sheet's second row.
excel_options = {"sheet_name": 1, "header": 1}
df = pd.read_excel('../data/patent.xlsx', **excel_options)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,广东省,1694
1,北京市,1230
2,浙江省,896
3,江苏省,553
4,山东省,399


In [9]:
# Fix the column dtypes at load time: province names stay generic Python
# objects, while the quantities are parsed as floats instead of integers.
column_types = {'Court Province': object, 'Patents Quantity': float}
df = pd.read_excel('../data/patent.xlsx', sheet_name=1, header=1, dtype=column_types)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,广东省,1694.0
1,北京市,1230.0
2,浙江省,896.0
3,江苏省,553.0
4,山东省,399.0


## Importing JSON files



In [10]:
# Column-oriented records: each outer key is a column name, and each inner
# dict maps a row label ("0", "1", ...) to that row's value, e.g.
# {"Name": {"0": "John", "1": "Nick", ...}, ...}.
def _label_rows(values):
    """Return a dict mapping the stringified position of each value to the value."""
    return {str(position): value for position, value in enumerate(values)}


students = {
    "Name": _label_rows(["John", "Nick", "Ali", "Joseph"]),
    "Gender": _label_rows(["Male", "Male", "Female", "Male"]),
    "Nationality": _label_rows(["UK", "French", "USA", "Brazil"]),
    "Age": _label_rows([10, 25, 35, 29]),
}

In [11]:
import json

# Serialise the students dictionary to a JSON file so it can be loaded back
# with pandas.
students_path = '../data/students.json'
with open(students_path, 'w') as json_file:
    json_file.write(json.dumps(students))

###### pandas.read_json

In [12]:
import pandas as pd

# Load the students JSON file into a DataFrame: the outer keys become the
# columns and the inner keys become the row labels.
students_json = '../data/students.json'
df = pd.read_json(students_json)
df

Unnamed: 0,Name,Gender,Nationality,Age
0,John,Male,UK,10
1,Nick,Male,French,25
2,Ali,Female,USA,35
3,Joseph,Male,Brazil,29


In [13]:
# Optional example: pd.read_json is pointed at a URL here instead of a local path.
# NOTE(review): left commented out, presumably so the notebook runs without
# network access; confirm the URL still resolves before re-enabling.
#iris_data = pd.read_json("https://raw.githubusercontent.com/domoritz/maps/master/data/iris.json")
#iris_data.head()

# Outputting data in pandas

## Outputting a DataFrame into a CSV file



In [14]:
# Reload the second worksheet (column names on its second row) as the frame
# to export.
df = pd.read_excel(io='../data/patent.xlsx', sheet_name=1, header=1)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,广东省,1694
1,北京市,1230
2,浙江省,896
3,江苏省,553
4,山东省,399


In [15]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="../data/patent2019.csv", index=False)

## Outputting a DataFrame into an Excel file
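Call `.to_excel()` on the DataFrame object to save it as an Excel workbook. Use the “.xlsx” extension; writing the legacy “.xls” format is only possible on older pandas versions that still include the `xlwt` writer.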

In [16]:
# Load the second worksheet again so this section starts from a known frame.
patent_workbook = '../data/patent.xlsx'
df = pd.read_excel(patent_workbook, sheet_name=1, header=1)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,广东省,1694
1,北京市,1230
2,浙江省,896
3,江苏省,553
4,山东省,399


In [17]:
# Save the frame as an Excel workbook; index=True also writes the row index
# as the first column of the sheet.
df.to_excel(excel_writer="../data/patent2019.xlsx", index=True)

## Outputting a DataFrame into a JSON file

In [18]:
import pandas as pd

# Second worksheet, column names taken from its second row.
read_options = dict(sheet_name=1, header=1)
df = pd.read_excel('../data/patent.xlsx', **read_options)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,广东省,1694
1,北京市,1230
2,浙江省,896
3,江苏省,553
4,山东省,399


In [19]:
# Save the frame as pretty-printed JSON.
# `indent` is documented as an integer (spaces per nesting level), so pass an
# explicit width rather than a boolean. force_ascii=False writes the Chinese
# province names as readable characters instead of \uXXXX escape sequences.
df.to_json("../data/patent2019.json", indent=4, force_ascii=False)

## Outputting a DataFrame into an HTML file

In [20]:
import pandas as pd

# Read the second worksheet once more; its second row supplies the column names.
source_workbook = '../data/patent.xlsx'
df = pd.read_excel(io=source_workbook, header=1, sheet_name=1)
df.head()

Unnamed: 0,Court Province,Patents Quantity
0,广东省,1694
1,北京市,1230
2,浙江省,896
3,江苏省,553
4,山东省,399


In [21]:
# Render the frame as an HTML table and write the markup to a file.
df.to_html(buf="../data/patent2019.html")

In [22]:
# With no target given, to_html() returns the markup as a string instead of
# writing a file.
html_markup = df.to_html()
html_markup

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Court Province</th>\n      <th>Patents Quantity</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>广东省</td>\n      <td>1694</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>北京市</td>\n      <td>1230</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>浙江省</td>\n      <td>896</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>江苏省</td>\n      <td>553</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>山东省</td>\n      <td>399</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>河南省</td>\n      <td>247</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>四川省</td>\n      <td>227</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>福建省</td>\n      <td>171</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>湖南省</td>\n      <td>150</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>安徽省</td>\n      <td>144</td>\n    </tr>\n    <t

In [23]:
# Go through the Styler to get the frame as LaTeX tabular source (a string).
latex_source = df.style.to_latex()
latex_source

'\\begin{tabular}{llr}\n & Court Province & Patents Quantity \\\\\n0 & 广东省 & 1694 \\\\\n1 & 北京市 & 1230 \\\\\n2 & 浙江省 & 896 \\\\\n3 & 江苏省 & 553 \\\\\n4 & 山东省 & 399 \\\\\n5 & 河南省 & 247 \\\\\n6 & 四川省 & 227 \\\\\n7 & 福建省 & 171 \\\\\n8 & 湖南省 & 150 \\\\\n9 & 安徽省 & 144 \\\\\n10 & 湖北省 & 129 \\\\\n11 & 河北省 & 129 \\\\\n12 & 山西省 & 106 \\\\\n13 & 上海市 & 90 \\\\\n14 & 江西省 & 83 \\\\\n15 & 云南省 & 82 \\\\\n16 & 天津市 & 69 \\\\\n17 & 辽宁省 & 68 \\\\\n18 & 重庆市 & 60 \\\\\n19 & 陕西省 & 44 \\\\\n20 & 广西壮族自治区 & 42 \\\\\n21 & 新疆维吾尔自治区 & 37 \\\\\n22 & 贵州省 & 33 \\\\\n23 & 甘肃省 & 29 \\\\\n24 & 内蒙古自治区 & 29 \\\\\n25 & 青海省 & 27 \\\\\n26 & 吉林省 & 23 \\\\\n27 & 黑龙江省 & 16 \\\\\n28 & 海南省 & 10 \\\\\n29 & 宁夏回族自治区 & 8 \\\\\n30 & 西藏自治区 & 0 \\\\\n31 & 台湾省 & 0 \\\\\n\\end{tabular}\n'

In [24]:
# Render the frame as a Markdown pipe table and return it as a string.
markdown_table = df.to_markdown()
markdown_table

'|    | Court Province   |   Patents Quantity |\n|---:|:-----------------|-------------------:|\n|  0 | 广东省           |               1694 |\n|  1 | 北京市           |               1230 |\n|  2 | 浙江省           |                896 |\n|  3 | 江苏省           |                553 |\n|  4 | 山东省           |                399 |\n|  5 | 河南省           |                247 |\n|  6 | 四川省           |                227 |\n|  7 | 福建省           |                171 |\n|  8 | 湖南省           |                150 |\n|  9 | 安徽省           |                144 |\n| 10 | 湖北省           |                129 |\n| 11 | 河北省           |                129 |\n| 12 | 山西省           |                106 |\n| 13 | 上海市           |                 90 |\n| 14 | 江西省           |                 83 |\n| 15 | 云南省           |                 82 |\n| 16 | 天津市           |                 69 |\n| 17 | 辽宁省           |                 68 |\n| 18 | 重庆市           |                 60 |\n| 19 | 陕西省           |                 44 |\n| 2